mirror of https://github.com/me50/kukemuna.git
101 lines
2.8 KiB
Python
101 lines
2.8 KiB
Python
|
|
import csv
|
||
|
|
import sys
|
||
|
|
|
||
|
|
|
||
|
|
def main():
    """Print the name of the person whose STR profile matches a DNA sequence.

    Command-line arguments (read from ``sys.argv``):
        1. Path to a CSV database. The first row is a header whose first
           column labels the name and whose remaining columns are the STRs
           to look for. Every following row is a name followed by one
           integer count per STR.
        2. Path to a text file whose first line is the DNA sequence.

    Prints the name from the first database row whose counts all equal the
    longest consecutive run of each STR in the sequence, then exits via
    ``sys.exit()``. Prints ``No match`` and returns if no row matches or
    the database is empty.

    Raises:
        SystemExit: with status 1 when the argument count is wrong, and
            with the default status after a match has been printed.
        ValueError: if a compared count in the database is not an integer.
    """
    # Check for command-line usage
    if len(sys.argv) != 3:
        print("Usage: python dna.py databases/*.csv sequences/*.txt")
        sys.exit(1)

    # Read the database with the csv module (newline="" as its docs advise).
    # csv.reader yields [] for blank lines; dropping them keeps a stray or
    # trailing empty line from causing an IndexError further down.
    with open(sys.argv[1], newline="") as db:
        rows = [row for row in csv.reader(db) if row]

    # Read the DNA sequence: the first line of the file, without its newline
    with open(sys.argv[2]) as seq:
        sequence = seq.readline().strip()

    # An empty database has no header and nobody to match against
    if not rows:
        print("No match")
        return

    # The header's first column labels the name; the rest are the STRs
    subsequences = rows[0][1:]

    # Find longest match of each STR in DNA sequence
    longest_list = [longest_match(sequence, str_) for str_ in subsequences]

    # Check database for matching profiles, in file order
    for name, *counts in rows[1:]:
        # Only as many columns as there are STRs are compared; a row with
        # too few columns yields a shorter list and simply does not match.
        profile = [int(count) for count in counts[:len(longest_list)]]
        if profile == longest_list:
            print(name)
            # Kept as sys.exit() so callers still observe SystemExit
            sys.exit()

    print("No match")
|
||
|
|
|
||
|
|
|
||
|
|
def longest_match(sequence, subsequence):
    """Return the length of the longest run of subsequence in sequence.

    A "run" is the number of back-to-back, non-overlapping repeats of
    ``subsequence`` starting at some position of ``sequence``.

    Args:
        sequence: The sequence to search (sliced with ``[start:end]``).
        subsequence: The pattern whose consecutive repeats are counted.

    Returns:
        The largest repeat count found at any starting position, or 0 when
        there is no occurrence or when ``subsequence`` is empty.
    """
    subsequence_length = len(subsequence)

    # An empty pattern equals every empty slice, so the run-counting loop
    # below would never terminate; treat it as "no run".
    if subsequence_length == 0:
        return 0

    longest_run = 0

    # Try every position in the sequence as the start of a run
    for i in range(len(sequence)):
        count = 0

        # Keep stepping one pattern-length ahead while the next window still
        # matches; a window that runs off the end is shorter than the pattern
        # and therefore compares unequal, which ends the run.
        while True:
            start = i + count * subsequence_length
            end = start + subsequence_length
            if sequence[start:end] != subsequence:
                break
            count += 1

        # Update most consecutive matches found
        longest_run = max(longest_run, count)

    # After checking for runs at each character in sequence, return longest run found
    return longest_run
|
||
|
|
|
||
|
|
|
||
|
|
# Run the program. NOTE: this call is unguarded, so it executes whenever the
# file is loaded, including when it is imported as a module.
main()
|