kukemuna-cs50/problems/2024/x/dna@20240324T201442.425499293Z

2024-03-24 20:14:42 +00:00 · 2024-03-24 20:14:42 +00:00 · 4ce65f086a
commit 4ce65f086a
1 changed files with 100 additions and 0 deletions
--- a/dna.py
+++ b/dna.py
@ -0,0 +1,100 @@
+import csv
+import sys
+
+
+def main():
+    """Print the name of the person whose STR profile matches a DNA sequence.
+
+    Reads two command-line arguments: the path of a CSV database whose
+    header row is a name column followed by STR names and whose remaining
+    rows hold a name followed by integer STR counts, and the path of a text
+    file whose first line is the DNA sequence.
+
+    Prints the name from the first database row whose counts all equal the
+    longest run of the corresponding STR in the sequence, or "No match"
+    when no row qualifies.  Exits with status 1 after printing a usage
+    message when the argument count is wrong.
+
+    Raises:
+        ValueError: if a compared count in the database is not an integer.
+    """
+
+    #  Check for command-line usage
+    if len(sys.argv) != 3:
+        print("Usage: python dna.py databases/*.csv sequences/*.txt")
+        sys.exit(1)
+
+    #  Read database file into a variable; csv.reader (with newline="")
+    #  handles "\r\n" line endings, and blank lines come back as empty
+    #  rows, which are dropped so they cannot be mistaken for a profile
+    with open(sys.argv[1], newline="") as db:
+        rows = [row for row in csv.reader(db) if row]
+
+    #  An empty database has no header and nobody to match
+    if not rows:
+        print("No match")
+        return
+
+    #  Extract subsequences: the header row minus its leading name column
+    subsequences = rows[0][1:]
+
+    #  Read DNA sequence file into a variable; only the first line is used
+    #  and an empty file simply yields an empty sequence
+    with open(sys.argv[2]) as seq:
+        sequence = seq.readline().strip()
+
+    #  Find longest match of each STR in DNA sequence
+    longest_list = [
+        longest_match(sequence, subsequence) for subsequence in subsequences
+    ]
+
+    #  Check database for matching profiles
+    wanted = len(longest_list)
+    for name, *counts in rows[1:]:
+
+        #  Columns beyond the STR columns are ignored; a row with too few
+        #  columns produces a shorter list and therefore cannot match
+        if [int(count) for count in counts[:wanted]] == longest_list:
+            print(name)
+            return
+
+    print("No match")
+
+
+def longest_match(sequence, subsequence):
+    """Return the length of the longest run of subsequence in sequence.
+
+    A run is a series of back-to-back, non-overlapping copies of
+    subsequence; its length is the number of copies.  Returns 0 when
+    subsequence does not occur in sequence, and also when subsequence is
+    empty.
+    """
+
+    # An empty subsequence equals every empty slice of sequence, so the
+    # scan below would never leave its inner loop; treat it as "no run"
+    if not subsequence:
+        return 0
+
+    # Initialize variables
+    longest_run = 0
+    subsequence_length = len(subsequence)
+
+    # Check each character in sequence for most consecutive runs of subsequence
+    for i in range(len(sequence)):
+
+        # Initialize count of consecutive runs
+        count = 0
+
+        # Step forward one subsequence-width at a time for as long as another
+        # copy starts exactly where the previous one ended; an offset at or
+        # past the end of sequence cannot match a non-empty subsequence
+        while sequence.startswith(subsequence, i + count * subsequence_length):
+            count += 1
+
+        # Update most consecutive matches found
+        longest_run = max(longest_run, count)
+
+    # After checking for runs at each character in sequence, return longest run found
+    return longest_run
+
+
+# Run the program only when executed as a script, so that importing this
+# module (for example to reuse longest_match) has no side effects
+if __name__ == "__main__":
+    main()