# dna.py -- recovered from patch 4ce65f086a5749ef7d4379309d3e1a76628e53b5
# (bot50, Sun, 24 Mar 2024 20:14:42 +0000,
#  kukemuna-cs50/problems/2024/x/dna@20240324T201442.425499293Z).
import csv
import sys


def main():
    """Identify whose STR profile matches a DNA sequence.

    Expects exactly two command-line arguments: a CSV database whose header
    row is ``name,<STR>,<STR>,...`` followed by one row per person holding
    the repeat count for each STR, and a text file holding the DNA sequence.

    Prints the name of the first person whose counts all equal the longest
    consecutive run of each STR in the sequence, or ``No match`` otherwise.
    Exits with status 1 (after printing a usage line) on wrong argument count.
    """
    # Check for command-line usage
    if len(sys.argv) != 3:
        print("Usage: python dna.py databases/*.csv sequences/*.txt")
        sys.exit(1)

    header, profiles = _load_database(sys.argv[1])

    # An empty database cannot match anyone.
    if not header:
        print("No match")
        return

    # The first column is the person's name; the rest are the STRs.
    subsequences = header[1:]

    # The sequence file is plain text, so read it as text rather than
    # borrowing the CSV parser (which fails on an empty file and would
    # truncate at any comma).
    with open(sys.argv[2]) as seq:
        sequence = seq.read().strip()

    # Find longest match of each STR in DNA sequence
    longest_list = [longest_match(sequence, str_name) for str_name in subsequences]

    # Check database for matching profiles
    name = _find_match(profiles, longest_list)
    print(name if name is not None else "No match")


def _load_database(path):
    """Return ``(header, rows)`` parsed from the CSV file at *path*.

    ``header`` is the first row as a list of strings (``None`` if the file is
    empty); ``rows`` are the remaining rows with blank lines dropped.
    """
    # newline="" is what the csv module asks for when handed a file object.
    with open(path, newline="") as db:
        reader = csv.reader(db)
        header = next(reader, None)
        rows = [row for row in reader if row]
    return header, rows


def _find_match(profiles, counts):
    """Return the name of the first profile whose STR counts equal *counts*.

    Each profile is ``[name, count, count, ...]`` with counts still as
    strings. Rows with too few columns can never match; extra trailing
    columns are ignored. Returns ``None`` when nobody matches. A non-numeric
    count raises ``ValueError`` from ``int``.
    """
    for row in profiles:
        # Guard the length first: zip() would silently truncate a short row
        # and could report a false match.
        if len(row) > len(counts) and all(
            int(value) == expected for value, expected in zip(row[1:], counts)
        ):
            return row[0]
    return None


def longest_match(sequence, subsequence):
    """Returns length of longest run of subsequence in sequence.

    A "run" is a number of back-to-back, non-overlapping copies of
    *subsequence*. Returns 0 when there is no occurrence, or when
    *subsequence* is empty.
    """
    subsequence_length = len(subsequence)

    # An empty subsequence equals every empty slice, so the scan below would
    # never terminate; treat it as "no run".
    if subsequence_length == 0:
        return 0

    longest_run = 0

    # Check each character in sequence for most consecutive runs of subsequence
    for i in range(len(sequence)):

        # Count how many copies of subsequence sit back-to-back starting at i
        count = 0
        start = i
        while sequence[start:start + subsequence_length] == subsequence:
            count += 1
            start += subsequence_length

        # Update most consecutive matches found
        longest_run = max(longest_run, count)

    # After checking for runs at each character in sequence, return longest run found
    return longest_run


# Only run the program when executed as a script, so importing this module
# (e.g. to reuse longest_match) has no side effects.
if __name__ == "__main__":
    main()