About CS50 Pset6 DNA, it overcounts STR for large.cvs

About CS50 Pset6 DNA, it overcounts STR for large.cvs - python

I'm working on pset6, DNA problem. This code is working for small.cvs but when I try the large one it overestimates the STR count. I guess the problem is when it tries to compare strings. But still don't know how to fix it. I checked that the counting is correct for the "TTTTTTCT" sequence but for the remaining STRs, the counting is in all cases larger than it should.
import sys
import csv
def main():
while (len(sys.argv) != 3):
print ("ERROR. Usage: python dna.py data.csv sequence.txt")
break
list_str = {}
#load the STRs to analyse
with open(sys.argv[1]) as csvfile:
readcsv = csv.reader (csvfile)
ncol = len(next(readcsv))
csvfile.seek(0)
header = list()
for line in readcsv:
a = sum(1 for line in readcsv)
for i in range(ncol):
list_str[line[i]] = 0
header.insert (i, line [i])
print (f"{header[i]}")
#open an work with the sequence file
sequence = open(sys.argv[2], 'r')
seq_r = sequence.read()
for k in list_str.keys():
#print (f"keu {k}")
p = 0
seq = len(seq_r)
while p < seq:
if seq_r[p:(p + len(k))] == k:
list_str[k] += 1
p += len(k)
else: p += 1
#print (f" sequenci encontrada{list_str[k]} y {k}")
print (f"nro de {k} {list_str[k]}")
with open(sys.argv[1]) as csvfile:
readcsv = csv.reader (csvfile)
next(csvfile)
find = False
for row in readcsv:
for j in range(1,ncol):
#print(f"header :{header[j]}")
if int(row [j]) == int(list_str[header[j]]):
print (f"row {row[j]} list {list_str[header[j]]}")
find = True
else:
find = False
break
if find == True: print (f"{row [0]}")
main()

The same thing happened to me, and then I saw the specifications of the pset.
We need to find the "longest run of consecutive repeats of the STR". Not the total count of STRs. It works for the small.csv as in my case too, so try to search for the longest consecutive occurrences of the specific STR.

Related

naive algorthim string matching

import pandas as pd
import csv
def naive_string_matching(text, pattern):
n = len(text)
m = len(pattern)
for i in range(n - m + 1):
j = 0
while j < m and text[i + j] == pattern[j]:
j += 1
if j == m:
return i
return -1
# Initialize an empty list to store the row strings
row_strings = []
# Open the CSV file for reading
with open("pcr_data.csv", 'r') as csv_file:
# Create a CSV reader object
reader = csv.reader(csv_file)
# Iterate through the rows in the CSV file
for i, row in enumerate(reader):
# Convert the row to a string
row_string = ''.join(row)
# Append the row string to the list
row_strings.append(row_string)
print('line[{}] = {}'.format(i, row_strings))
def search_in_file(text, pattern):
index = naive_string_matching(text, pattern)
if index != -1:
print("The pattern found at index: ", index)
else:
print("The pattern was not found in the file")
search_in_file(row_strings,'01001111110')
my data in cvs file
#from this [10101010,10101010] to:
#'01011111001', '11101111111', '11101111011', '11101111011', #'11011111011', '10101111001',
I have CVS file with 1740 rows and 11 col, and i store the data in list of string and convert from 001010101 to string with remove( , )
but i have problem here the orginal algothrtim doesnot work and give me this error
TypeError: object of type '_io.TextIOWrapper' has no len()
and it cannot find the match pattern.
I try use pandas for reading but also give me wrong result
please help me

How to transform a csv file into a multi-dimensional list using Python?

I started out with a 4d list, something like
tokens = [[[["a"], ["b"], ["c"]], [["d"]]], [[["e"], ["f"], ["g"]],[["h"], ["i"], ["j"], ["k"], ["l"]]]]
So I converted this to a csv file using the code
import csv
def export_to_csv(tokens):
csv_list = [["A", "B", "C", word]]
for h_index, h in enumerate(tokens):
for i_index, i in enumerate(h):
for j_index, j in enumerate(i):
csv_list.append([h_index, i_index, j_index, j])
with open('TEST.csv', 'w') as f:
# using csv.writer method from CSV package
write = csv.writer(f)
write.writerows(csv_list)
But now I want to do the reverse process, want to convert a csv file obtained in this format, back to the list format mentioned above.

Assuming you wanted your csv file to look something like this (there were a couple typos in the posted code):
A,B,C,word
0,0,0,a
0,0,1,b
0,0,2,c
...
here's one solution:
import csv
def import_from_csv(filename):
retval = []
with open(filename) as fh:
reader = csv.reader(fh)
# discard header row
next(reader)
# process data rows
for (x,y,z,word) in reader:
x = int(x)
y = int(y)
z = int(z)
retval.extend([[[]]] * (x + 1 - len(retval)))
retval[x].extend([[]] * (y + 1 - len(retval[x])))
retval[x][y].extend([0] * (z + 1 - len(retval[x][y])))
retval[x][y][z] = [word]
return retval

def import_from_csv(file):
import ast
import csv
data = []
# Read the CSV file
with open(file) as fp:
reader = csv.reader(fp)
# Skip the first line, which contains the headers
next(reader)
for line in reader:
# Read the first 3 elements of the line
a, b, c = [int(i) for i in line[:3]]
# When we read it back, everything comes in as strings. Use
# `literal_eval` to convert it to a Python list
value = ast.literal_eval(line[3])
# Extend the list to accomodate the new element
data.append([[[]]]) if len(data) < a + 1 else None
data[a].append([[]]) if len(data[a]) < b + 1 else None
data[a][b].append([]) if len(data[a][b]) < c + 1 else None
data[a][b][c] = value
return data
# Test
assert import_from_csv("TEST.csv") == tokens

First, I'd make writing this construction in a CSV format independent from dimensions:
import csv
def deep_iter(seq):
for i, val in enumerate(seq):
if type(val) is list:
for others in deep_iter(val):
yield i, *others
else:
yield i, val
with open('TEST.csv', 'w') as f:
csv.writer(f).writerows(deep_iter(tokens))
Next, we can use the lexicographic order of the indices to recreate the structure. All we have to do is sequentially move deeper into the output list according to the indices of a word. We stop at the penultimate index to get the last list, because the last index is pointing only at the place of the word in this list and doesn't matter due to the natural ordering:
with open('TEST.csv', 'r') as f:
rows = [*csv.reader(f)]
res = []
for r in rows:
index = r[:-2] # skip the last index and word
e = res
while index:
i = int(index.pop(0)) # get next part of a current index
if i < len(e):
e = e[i]
else:
e.append([]) # add new record at this level
e = e[-1]
e.append(r[-1]) # append the word to the corresponding list

CSV Python Outputting: Outputting non-matching field once rather than once for every item in list

I've been trying to figure this out for about a year now and I'm really burnt out on it so please excuse me if this explanation is a bit rough.
I cannot include job data, but it would be accurate to imagine 2 csv files both with the first column populated with values (Serial numbers/phone numbers/names, doesn't matter - just values). Between both csv files, some values would match while other values would only be contained in one or the other (Timmy is in both files and is a match, Robert is only in file 1 and does not match any name in file 2).
I can successfully output a csv value ONCE that exists in the both csv files (I.e. both files contain "Value78", output file will contain "Value78" only once).
When I try to tack on an else statement to my if condition, to handle non-matching items, the program will output 1 entry for every item it does not match with (makes 100% sense, matches happen once but every other comparison result besides the match is a non-match).
I cannot envision a structure or method to hold the fields that don't match back so that they can be output once and not overrun my terminal or output file.
My goal is to output two csv files, matches and non-matches, with the non-matches having only one entry per value.
Anyways, onto the code:
import csv
MYUNITS = 'MyUnits.csv'
VENDORUNITS = 'VendorUnits.csv'
MATCHES = 'Matches.csv'
NONMATCHES = 'NonMatches.csv'
with open(MYUNITS,mode='r') as MFile,
open(VENDORUNITS,mode='r') as VFile,
open(MATCHES,mode='w') as OFile,
open(NONMATCHES,mode'w') as NFile:
MyReader = csv.reader(MFile,delimiter=',',quotechar='"')
MyList = list(MyReader)
VendorReader = csv.reader(VFile,delimiter=',',quotechar='"')
VList = list(VendorReader)
for x in range(len(MyList)):
for y in range(len(VList)):
if str(MyList[x][0]) == str(VList[y][0]):
OFile.write(MyList[x][0] + '\n')
else:
pass
The "else: pass" is where the logic of filtering out non-matches is escaping me. Outputting from this else statement will write the non-matching value (len(VList) - 1) times for an iteration that DOES produce 1 match, the entire len(VList) for an iteration with no match. I've tried using a counter and only outputting if the counter equals the len(VList), (incrementing in the else statement, writing output under the scope of the second for loop), but received the same output as if I tried outputting non-matches.

Below is one way you might go about deduplicating and then writing to a file:
import csv
MYUNITS = 'MyUnits.csv'
VENDORUNITS = 'VendorUnits.csv'
MATCHES = 'Matches.csv'
NONMATCHES = 'NonMatches.csv'
list_of_non_matches = []
with open(MYUNITS,mode='r') as MFile,
open(VENDORUNITS,mode='r') as VFile,
open(MATCHES,mode='w') as OFile,
open(NONMATCHES,mode'w') as NFile:
MyReader = csv.reader(MFile,delimiter=',',quotechar='"')
MyList = list(MyReader)
VendorReader = csv.reader(VFile,delimiter=',',quotechar='"')
VList = list(VendorReader)
for x in range(len(MyList)):
for y in range(len(VList)):
if str(MyList[x][0]) == str(VList[y][0]):
OFile.write(MyList[x][0] + '\n')
else:
list_of_non_matches.append(MyList[x][0])
# Remove duplicates from the non matches
new_list = []
[new_list.append(x) for x in list_of_non_matches if x not in new_list]
# Write the new list to a file
for i in new_list:
NFile.write(i + '\n')

Does this work?
import csv
MYUNITS = 'MyUnits.csv'
VENDORUNITS = 'VendorUnits.csv'
MATCHES = 'Matches.csv'
NONMATCHES = 'NonMatches.csv'
with open(MYUNITS,'r') as MFile,
(VENDORUNITS,'r') as VFile,
(MATCHES,'w') as OFile,
(NONMATCHES,mode,'w') as NFile:
MyReader = csv.reader(MFile,delimiter=',',quotechar='"')
MyList = list(MyReader)
MyVals = [x for x in MyList]
MyVals = [x[0] for x in MyVals]
VendorReader = csv.reader(VFile,delimiter=',',quotechar='"')
VList = list(VendorReader)
vVals = [x for x in VList]
vVals = [x[0] for x in vVals]
for val in MyVals:
if val in vVals:
OFile.write(Val + '\n')
else:
NFile.write(Val + '\n')
#for x in range(len(MyList)):
# for y in range(len(VList)):
# if str(MyList[x][0]) == str(VList[y][0]):
# OFile.write(MyList[x][0] + '\n')
# else:
# pass

Sorry, I had some issues with my PC. I was able to solve my own question the night I posted. The solution I used is so simple I'm kicking myself for not figuring it out way sooner:
import csv
MYUNITS = 'MyUnits.csv'
VENDORUNITS = 'VendorUnits.csv'
MATCHES = 'Matches.csv'
NONMATCHES = 'NonMatches.csv'
with open(MYUNITS,mode='r') as MFile,
open(VENDORUNITS,mode='r') as VFile,
open(MATCHES,mode='w') as OFile,
open(NONMATCHES,mode'w') as NFile:
MyReader = csv.reader(MFile,delimiter=',',quotechar='"')
MyList = list(MyReader)
VendorReader = csv.reader(VFile,delimiter=',',quotechar='"')
VList = list(VendorReader)
for x in range(len(MyList)):
tmpStr = ''
for y in range(len(VList)):
if str(MyList[x][0]) == str(VList[y][0]):
tmpStr = '' #Sets to blank so comparison fails, works because break
OFile.write(MyList[x][0] + '\n')
break
else:
tmp = str(MyList[x][0])
if tmp != '':
NFile.write(tmp + '\n')

Comparison of two elements in different array

My problem:
I am trying to compare two elements from two different arrays but the operator is not working.
Code Snippet in question:
for i in range(row_length):
print(f"ss_record: {ss_record[i]}")
print(f"row: {row[i + 1]}")
#THIS IF STATEMENT IS NOT WORKING
if ss_record[i] == row[i + 1]:
count += 1
#print()
#print(f"row length: {row_length}")
#print(f"count: {count}")
if count == row_length:
print(row[0])
exit(0)
What I have done: I tried to print the value of ss_record and row before it runs through the if statement but when it matches, count doesn't increase. I tried storing the value of row in a new array but it bugs out and only store the array length and first 2 value of row and repeats those values every next instance.
What I think the issue: I think the issue with my code is that row is being read from a CSV file and is not being converted into an integer as a result, it appears they are the same but one is an integer while the other is a string.
Entire Code:
import csv
import sys
import re
from cs50 import get_string
from sys import argv
def main():
line_count = 0
if len(argv) != 3:
print("missing command-line argument")
exit(1)
with open(sys.argv[1], 'r') as database:
sequence = open(sys.argv[2], 'r')
string = sequence.read()
reader = csv.reader(database, delimiter = ',')
for row in reader:
if line_count == 0:
row_length = len(row) - 1
ss_record = [row_length]
for i in range(row_length):
ss_record.append(ss_count(string, row[i + 1], len(row[i + 1])))
ss_record.pop(0)
line_count = 1
else:
count = 0
for i in range(row_length):
print(f"ss_record: {ss_record[i]}")
print(f"row: {row[i + 1]}")
#THIS IF STATEMENT IS NOT WORKING
if ss_record[i] == row[i + 1]:
count += 1
if count == row_length:
print(row[0])
exit(0)
#ss_count mean the # of times the substring appear in the string
def ss_count(string, substring, length):
count = 1
record = 0
pos_array = []
for m in re.finditer(substring, string):
pos_array.append(m.start())
for i in range(len(pos_array) - 1):
if pos_array[i + 1] - pos_array[i] == length:
count += 1
else:
if count > record:
record = count
count = 1
if count > record:
record = count
return record
main()
Values to use to reproduce issue:
sequence (this is a text file) = AAGGTAAGTTTAGAATATAAAAGGTGAGTTAAATAGAATAGGTTAAAATTAAAGGAGATCAGATCAGATCAGATCTATCTATCTATCTATCTATCAGAAAAGAGTAAATAGTTAAAGAGTAAGATATTGAATTAATGGAAAATATTGTTGGGGAAAGGAGGGATAGAAGG
substring (this is a csv file) =
name,AGATC,AATG,TATC
Alice,2,8,3
Bob,4,1,5
Charlie,3,2,5
Gist of the CSV file:
The numbers beside Alice means how many times a substring(STR/Short Tandem Repeat) appears in a row in the string(DNA sequence). In this string, AGATC appears 4 times in a row, AATG appears 1 time in a row, and TATC appears 5 times in a row. For this DNA sequence, it matches Bob and he outputted as the answer.

You were right, when you compare ss_record[i] == row[i + 1]: there is a type problem, the numbers of ss_record are integers while the numbers of the row are strings. You may acknowledge the issue by printing both ss_record and row:
print("ss_record: {}".format(ss_record)) -> ss_record: [4, 1, 5]
print("row: {}".format(row)) -> row: ['Alice', '2', '8', '3']
In order for the snippet to work you just need to change the comparison to
ss_record[i] == int(row[i + 1])
That said, I feel the code is quite complex for the task. The string class implements a count method that returns the number of non-overlapping occurrences of a given substring. Also, since the code it's working in an item basis and relies heavily in index manipulations the iteration logic is hard to follow (IMO). Here's my approach to the problem:
import csv
def match_user(dna_file, user_csv):
with open(dna_file, 'r') as r:
dna_seq = r.readline()
with open(user_csv, 'r') as r:
reader = csv.reader(r)
rows = list(reader)
target_substrings = rows[0][1:]
users = rows[1:]
num_matches = [dna_seq.count(target) for target in target_substrings]
for user in users:
user_matches = [int(x) for x in user[1:]]
if user_matches == num_matches:
return user[0]
return "Not found"
Happy Coding!

How do you sort by ascending order from a txt file in python?

I am new to python and have a hopefully simple question. I have a txt file with road names and lengths. i.e. Main street 56 miles. I am stumped on how to call on that file in Python and sort all the road lengths in ascending order. Thanks for your help.

Let's assume that there is a "miles" after every number. (This is an untested code so you can get it edited but I think the idea is right).
EDIT: THIS IS TESTED
import collections
originaldict = {}
newdict = collections.OrderedDict()
def isnum(string):
try:
if string is ".":
return True
float(string)
return True
except Exception:
return False
for line in open(input_file, "r"):
string = line[:line.find("miles") - 1]
print string
curnum = ""
for c in reversed(string):
if not isnum(c):
break
curnum = c + curnum
originaldict[float(curnum)] = []
originaldict[float(curnum)].append(line)
for num in sorted(originaldict.iterkeys()):
newdict[num] = []
newdict[num].append(originaldict[num][0])
del originaldict[num][0]
with open(output_file, "a") as o:
for value in newdict.values():
for line in value:
o.write(line + "\n")

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

About CS50 Pset6 DNA, it overcounts STR for large.cvs - python

The same thing happened to me, and then I saw the specifications of the pset. We need to find the "longest run of consecutive repeats of the STR". Not the total count of STRs. It works for the small.csv as in my case too, so try to search for the longest consecutive occurrences of the specific STR.

Related

naive algorthim string matching

How to transform a csv file into a multi-dimensional list using Python?

CSV Python Outputting: Outputting non-matching field once rather than once for every item in list

Comparison of two elements in different array

How do you sort by ascending order from a txt file in python?

Categories

Resources