mirror of https://github.com/me50/kukemuna.git
This commit is contained in:
commit
4ce65f086a
|
|
@ -0,0 +1,100 @@
|
|||
import csv
|
||||
import sys
|
||||
|
||||
|
||||
def main():
    """Print the name of the person whose STR profile matches a DNA sequence.

    Reads two command-line arguments:

    * ``sys.argv[1]`` -- a CSV database. The header row holds a name column
      followed by one column per STR; every later row holds a person's name
      followed by that person's repeat count for each STR.
    * ``sys.argv[2]`` -- a text file whose first line is the DNA sequence.

    Prints the name of the first person whose counts equal the longest run
    of every STR in the sequence, or ``No match`` if nobody qualifies.
    Exits with status 1 when the argument count is wrong.

    Raises:
        ValueError: if a compared repeat count in the database is not an
            integer.
    """
    # Check for command-line usage
    if len(sys.argv) != 3:
        print("Usage: python dna.py databases/*.csv sequences/*.txt")
        sys.exit(1)

    # Read database file; the csv module handles quoting and line endings
    # (newline="" is what the csv documentation asks for on open()).
    with open(sys.argv[1], newline="") as db:
        reader = csv.reader(db)
        header = next(reader, [])
        # csv.reader yields [] for blank lines; they carry no profile
        profiles = [row for row in reader if row]

    # The first header column is the name label; the rest are the STRs
    subsequences = header[1:]

    # Read DNA sequence: first line of the file, without surrounding
    # whitespace. An empty file simply yields an empty sequence.
    with open(sys.argv[2]) as seq:
        sequence = seq.readline().strip()

    # Find longest match of each STR in DNA sequence
    longest_list = [
        longest_match(sequence, subsequence) for subsequence in subsequences
    ]

    # Check database for matching profiles
    for profile in profiles:
        name = profile[0]
        counts = profile[1:len(longest_list) + 1]

        # A row missing some STR columns cannot be a full match
        if len(counts) != len(longest_list):
            continue

        if all(int(count) == longest
               for count, longest in zip(counts, longest_list)):
            print(name)
            return

    print("No match")
|
||||
|
||||
|
||||
def longest_match(sequence, subsequence):
    """Return the length of the longest run of subsequence in sequence.

    A "run" is a number of copies of ``subsequence`` that appear back to
    back, with nothing in between, somewhere in ``sequence``.

    Args:
        sequence: the text to search.
        subsequence: the pattern whose consecutive repeats are counted.

    Returns:
        The largest number of consecutive repeats found, or 0 when the
        pattern never occurs or is empty.
    """
    subsequence_length = len(subsequence)

    # An empty pattern "matches" at every position without consuming any
    # characters, so the scan below would never terminate; treat it as
    # having no run at all.
    if subsequence_length == 0:
        return 0

    longest_run = 0

    # Try every start position that still leaves room for one full copy
    for i in range(len(sequence) - subsequence_length + 1):
        count = 0
        start = i

        # Step forward one pattern-length at a time while copies keep
        # lining up; an out-of-range slice is shorter than the pattern and
        # therefore ends the run.
        while sequence[start:start + subsequence_length] == subsequence:
            count += 1
            start += subsequence_length

        # Update most consecutive matches found
        longest_run = max(longest_run, count)

    # After checking for runs at each character in sequence, return longest run found
    return longest_run
|
||||
|
||||
|
||||
# Run the program only when executed as a script, not when imported
if __name__ == "__main__":
    main()
|
||||
Loading…
Reference in New Issue