bot50 2024-03-24 20:15:17 +00:00
commit 401b8fef15
1 changed files with 100 additions and 0 deletions

100
dna.py Normal file
View File

@ -0,0 +1,100 @@
import csv
import sys
def main():
    """Print the name of the person whose STR profile matches a DNA sequence.

    Expects exactly two command-line arguments: the path of a CSV database
    and the path of a text file holding a DNA sequence. The first row of the
    database is a header whose first column labels the name and whose
    remaining columns are the STR subsequences; every following row holds a
    name followed by one integer count per STR.

    Prints the first name whose counts all equal the longest run of the
    corresponding STR in the sequence, or "No match" if there is none.
    Exits with status 1 after printing a usage message when the argument
    count is wrong.

    Raises:
        ValueError: if a count in the database is not an integer.
    """
    # Check for command-line usage
    if len(sys.argv) != 3:
        print("Usage: python dna.py databases/*.csv sequences/*.txt")
        sys.exit(1)

    # Read the database; csv.reader yields an empty list for a blank line,
    # so filtering on truthiness keeps stray blank lines from crashing later.
    with open(sys.argv[1], newline="") as db:
        rows = [row for row in csv.reader(db) if row]

    # The header's first column is the name label; the rest are the STRs.
    subsequences = rows[0][1:] if rows else []
    profiles = rows[1:]

    # Read the DNA sequence as plain text, dropping the trailing newline
    with open(sys.argv[2]) as seq:
        sequence = seq.read().strip()

    # Find longest match of each STR in DNA sequence
    longest_list = [
        longest_match(sequence, subsequence) for subsequence in subsequences
    ]

    # Check database for matching profiles
    for profile in profiles:
        # Only the columns that line up with an STR are compared; a row with
        # too few columns produces a shorter list and therefore cannot match.
        counts = [int(value) for value in profile[1:len(longest_list) + 1]]
        if counts == longest_list:
            print(profile[0])
            return

    print("No match")
def longest_match(sequence: str, subsequence: str) -> int:
    """Return the length of the longest run of subsequence in sequence.

    A run is a number of copies of ``subsequence`` that appear back to back,
    without overlapping, starting at some position in ``sequence``.

    Args:
        sequence: The text to search.
        subsequence: The repeating unit to look for.

    Returns:
        The largest number of consecutive copies found, or 0 when there are
        none (including when ``subsequence`` is empty).
    """
    subsequence_length = len(subsequence)

    # An empty subsequence equals every empty slice, so the run-counting loop
    # below would never terminate; there is no meaningful run to report.
    if subsequence_length == 0:
        return 0

    longest_run = 0

    # A run cannot start where fewer than subsequence_length characters
    # remain, so the scan stops at the last position a full copy still fits.
    for i in range(len(sequence) - subsequence_length + 1):
        # Count consecutive copies starting at position i
        count = 0

        # Keep stepping one subsequence ahead while each step still matches
        while sequence.startswith(subsequence, i + count * subsequence_length):
            count += 1

        # Update most consecutive matches found
        longest_run = max(longest_run, count)

    # After checking for runs at each character in sequence, return longest run found
    return longest_run
# Run the program only when this file is executed as a script, so importing
# the module (e.g. to reuse longest_match) does not parse sys.argv or exit.
if __name__ == "__main__":
    main()