# cs50/dna.py

import csv
import sys


def main():
    """Print the name of the person whose STR counts match a DNA sequence.

    Command-line arguments (read from ``sys.argv``):
        1. Path to a CSV database. The header row is ``name`` followed by
           one column per STR; each following row is a person's name and
           their count for each STR.
        2. Path to a text file whose first line is the DNA sequence.

    Prints the first matching person's name, or ``No match`` if no row of
    the database matches every STR count. Exits with status 1 (after
    printing a usage message) when the argument count is wrong.

    Raises:
        ValueError: if a count cell in the database is not an integer.
    """
    # Check for command-line usage
    if len(sys.argv) != 3:
        print("Usage: python dna.py databases/*.csv sequences/*.txt")
        sys.exit(1)

    # Read the database with the csv module (newline="" is what the csv
    # docs require). Blank lines come back as empty rows and are dropped so
    # a trailing newline in the file cannot cause an IndexError later.
    with open(sys.argv[1], newline="") as db:
        reader = csv.reader(db)
        header = next(reader, [])
        profiles = [row for row in reader if row]

    # Every header column except the leading "name" column is an STR
    subsequences = header[1:]

    # The DNA sequence is the first line of the sequence file
    with open(sys.argv[2]) as seq:
        sequence = seq.readline().strip()

    # Find longest match of each STR in DNA sequence
    longest_runs = [longest_match(sequence, str_) for str_ in subsequences]

    # Check database for a profile whose every STR count matches
    for row in profiles:
        name = row[0]
        # Only the columns that line up with the header's STRs are compared
        counts = row[1:len(subsequences) + 1]

        # A row with too few columns cannot be a complete profile; skip it
        # rather than let zip() silently compare a truncated prefix.
        if len(counts) != len(longest_runs):
            continue

        if all(int(cell) == run for cell, run in zip(counts, longest_runs)):
            print(name)
            return

    print("No match")


def longest_match(sequence, subsequence):
    """Return the length of the longest run of subsequence in sequence.

    A "run" is a number of copies of ``subsequence`` appearing back to back
    in ``sequence`` with nothing in between.

    Args:
        sequence: The text to search.
        subsequence: The pattern whose consecutive repeats are counted.

    Returns:
        The largest number of consecutive repeats found, or 0 if the
        pattern never occurs or is empty.
    """
    subsequence_length = len(subsequence)

    # An empty pattern compares equal to every empty slice, so the inner
    # loop below would never terminate; define its longest run as 0.
    if subsequence_length == 0:
        return 0

    longest_run = 0

    # A run can only start where at least one full copy still fits, so
    # offsets past this point are skipped (range is empty if nothing fits).
    last_start = len(sequence) - subsequence_length

    for i in range(last_start + 1):

        # Count how many back-to-back copies begin at offset i
        count = 0
        while True:
            # Window that the next copy would have to occupy
            start = i + count * subsequence_length
            end = start + subsequence_length

            if sequence[start:end] != subsequence:
                break
            count += 1

        # Update most consecutive matches found
        longest_run = max(longest_run, count)

    return longest_run


# Run the command-line program only when executed as a script, so that
# importing this module does not parse sys.argv or exit the interpreter.
if __name__ == "__main__":
    main()
automated commit by check50 [check50=True] 2024-03-24 22:14:38 +02:00			`import csv`
			`import sys`


			`def main():`

			`# Check for command-line usage`
			`if len(sys.argv) != 3:`
			`print("Usage: python dna.py databases/*.csv sequences/*.txt")`
			`sys.exit(1)`

			`# Read database file into a variable`
			`rows = []`
			`with open(sys.argv[1]) as db:`
			`for row in db:`
			`rows.append(row)`

			`# Extract subsequences, removing name "pop(0)" and newline ".strip("n")"`
			`rows[0] = rows[0].strip("\n")`
			`subsequences = rows[0].split(',')`
			`subsequences.pop(0)`

			`# Read DNA sequence file into a variable`
			`with open(sys.argv[2]) as seq:`
			`sequence = csv.DictReader(seq)`
			`sequence = sequence.fieldnames[0]`

			`# Find longest match of each STR in DNA sequence`
			`longest_list = []`
			`for i in range(len(subsequences)):`
			`longest = longest_match(sequence, subsequences[i])`
			`longest_list.append(longest)`

			`# Check database for matching profiles`
			`suspects = []`
			`suspect = []`

			`# Create a list of "suspects"`
			`for s in range(1, len(rows)):`
			`suspects.append(rows[s].strip("\n"))`

			`# Iterate over each "suspect"`
			`for s in range(len(suspects)):`
			`suspect.append(suspects[s].split(','))`

			`for m in range(len(suspects)):`
			`match = 0`
			`# Iterate over each subsequence`
			`for n in range(len(longest_list)):`
			`if (int(suspect[m][n + 1]) == longest_list[n]):`
			`match += 1`

			`if (len(longest_list) == match):`
			`print(suspect[m][0])`
			`sys.exit()`

			`print("No match")`

			`return`


			`def longest_match(sequence, subsequence):`
			`"""Returns length of longest run of subsequence in sequence."""`

			`# Initialize variables`
			`longest_run = 0`
			`subsequence_length = len(subsequence)`
			`sequence_length = len(sequence)`

			`# Check each character in sequence for most consecutive runs of subsequence`
			`for i in range(sequence_length):`

			`# Initialize count of consecutive runs`
			`count = 0`

			`# Check for a subsequence match in a "substring" (a subset of characters) within sequence`
			`# If a match, move substring to next potential match in sequence`
			`# Continue moving substring and checking for matches until out of consecutive matches`
			`while True:`

			`# Adjust substring start and end`
			`start = i + count * subsequence_length`
			`end = start + subsequence_length`

			`# If there is a match in the substring`
			`if sequence[start:end] == subsequence:`
			`count += 1`

			`# If there is no match in the substring`
			`else:`
			`break`

			`# Update most consecutive matches found`
			`longest_run = max(longest_run, count)`

			`# After checking for runs at each character in sequence, return longest run found`
			`return longest_run`


			`main()`