import csv
import sys


def main():
    """Print the name of the person whose STR profile matches a DNA sequence.

    Expects exactly two command-line arguments: the path of a CSV database
    (header row: a name column followed by one column per STR; each further
    row: a name followed by integer repeat counts) and the path of a text
    file holding the DNA sequence.

    Prints the name from the first database row whose counts all equal the
    longest runs found in the sequence, or "No match" if there is none.
    Exits with status 1 when the argument count is wrong.
    """
    # Check for command-line usage
    if len(sys.argv) != 3:
        print("Usage: python dna.py databases/*.csv sequences/*.txt")
        sys.exit(1)

    subsequences, profiles = _read_database(sys.argv[1])
    sequence = _read_sequence(sys.argv[2])

    # Find longest match of each STR in DNA sequence
    longest_list = [longest_match(sequence, s) for s in subsequences]

    # Check database for matching profiles; the first match wins
    for profile in profiles:
        if _profile_matches(profile, longest_list):
            print(profile[0])
            return

    print("No match")


def _read_database(path):
    """Return (STR names, profile rows) parsed from the CSV file at *path*.

    The STR names are the header fields after the first (name) column.
    Each profile row is a list of strings: the name followed by the counts.
    Blank lines are skipped; an empty file yields two empty lists.
    """
    # newline="" is what the csv module asks for when given a file object
    with open(path, newline="") as db:
        reader = csv.reader(db)
        header = next(reader, None)
        if header is None:
            return [], []
        # csv.reader yields [] for blank lines; drop them so a trailing
        # empty line is not mistaken for a profile
        profiles = [row for row in reader if row]
    return header[1:], profiles


def _read_sequence(path):
    """Return the text of the file at *path* without surrounding whitespace."""
    with open(path) as seq:
        return seq.read().strip()


def _profile_matches(profile, counts):
    """Return True if the fields after the name in *profile* equal *counts*.

    Fields are compared positionally as integers; columns beyond
    len(counts) are ignored. Raises ValueError if a compared field is not
    an integer.
    """
    recorded = profile[1:len(counts) + 1]
    # A row with too few columns cannot be a match
    if len(recorded) != len(counts):
        return False
    return all(int(value) == count for value, count in zip(recorded, counts))


def longest_match(sequence, subsequence):
    """Returns length of longest run of subsequence in sequence.

    A run is a series of back-to-back, non-overlapping copies of
    subsequence. Returns 0 when subsequence is empty or never occurs.
    """
    # Initialize variables
    longest_run = 0
    subsequence_length = len(subsequence)
    sequence_length = len(sequence)

    # An empty subsequence equals every empty slice, so the matching loop
    # below would never terminate; treat it as having no run
    if subsequence_length == 0:
        return 0

    # Check each character in sequence for most consecutive runs of subsequence
    for i in range(sequence_length):

        # Initialize count of consecutive runs
        count = 0

        # Check for a subsequence match in a "substring" (a subset of
        # characters) within sequence. If a match, move substring to next
        # potential match in sequence. Continue moving substring and checking
        # for matches until out of consecutive matches
        while True:

            # Adjust substring start and end
            start = i + count * subsequence_length
            end = start + subsequence_length

            # If there is a match in the substring
            if sequence[start:end] == subsequence:
                count += 1

            # If there is no match in the substring
            else:
                break

        # Update most consecutive matches found
        longest_run = max(longest_run, count)

    # After checking for runs at each character in sequence, return longest run found
    return longest_run


# Run only when executed as a script, so importing this module has no side effects
if __name__ == "__main__":
    main()