naive algorthim string matching

naive algorthim string matching - python

import pandas as pd
import csv
def naive_string_matching(text, pattern):
n = len(text)
m = len(pattern)
for i in range(n - m + 1):
j = 0
while j < m and text[i + j] == pattern[j]:
j += 1
if j == m:
return i
return -1
# Initialize an empty list to store the row strings
row_strings = []
# Open the CSV file for reading
with open("pcr_data.csv", 'r') as csv_file:
# Create a CSV reader object
reader = csv.reader(csv_file)
# Iterate through the rows in the CSV file
for i, row in enumerate(reader):
# Convert the row to a string
row_string = ''.join(row)
# Append the row string to the list
row_strings.append(row_string)
print('line[{}] = {}'.format(i, row_strings))
def search_in_file(text, pattern):
index = naive_string_matching(text, pattern)
if index != -1:
print("The pattern found at index: ", index)
else:
print("The pattern was not found in the file")
search_in_file(row_strings,'01001111110')
my data in cvs file
#from this [10101010,10101010] to:
#'01011111001', '11101111111', '11101111011', '11101111011', #'11011111011', '10101111001',
I have CVS file with 1740 rows and 11 col, and i store the data in list of string and convert from 001010101 to string with remove( , )
but i have problem here the orginal algothrtim doesnot work and give me this error
TypeError: object of type '_io.TextIOWrapper' has no len()
and it cannot find the match pattern.
I try use pandas for reading but also give me wrong result
please help me

Related

How to transform a csv file into a multi-dimensional list using Python?

I started out with a 4d list, something like
tokens = [[[["a"], ["b"], ["c"]], [["d"]]], [[["e"], ["f"], ["g"]],[["h"], ["i"], ["j"], ["k"], ["l"]]]]
So I converted this to a csv file using the code
import csv
def export_to_csv(tokens):
csv_list = [["A", "B", "C", word]]
for h_index, h in enumerate(tokens):
for i_index, i in enumerate(h):
for j_index, j in enumerate(i):
csv_list.append([h_index, i_index, j_index, j])
with open('TEST.csv', 'w') as f:
# using csv.writer method from CSV package
write = csv.writer(f)
write.writerows(csv_list)
But now I want to do the reverse process, want to convert a csv file obtained in this format, back to the list format mentioned above.

Assuming you wanted your csv file to look something like this (there were a couple typos in the posted code):
A,B,C,word
0,0,0,a
0,0,1,b
0,0,2,c
...
here's one solution:
import csv
def import_from_csv(filename):
retval = []
with open(filename) as fh:
reader = csv.reader(fh)
# discard header row
next(reader)
# process data rows
for (x,y,z,word) in reader:
x = int(x)
y = int(y)
z = int(z)
retval.extend([[[]]] * (x + 1 - len(retval)))
retval[x].extend([[]] * (y + 1 - len(retval[x])))
retval[x][y].extend([0] * (z + 1 - len(retval[x][y])))
retval[x][y][z] = [word]
return retval

def import_from_csv(file):
import ast
import csv
data = []
# Read the CSV file
with open(file) as fp:
reader = csv.reader(fp)
# Skip the first line, which contains the headers
next(reader)
for line in reader:
# Read the first 3 elements of the line
a, b, c = [int(i) for i in line[:3]]
# When we read it back, everything comes in as strings. Use
# `literal_eval` to convert it to a Python list
value = ast.literal_eval(line[3])
# Extend the list to accomodate the new element
data.append([[[]]]) if len(data) < a + 1 else None
data[a].append([[]]) if len(data[a]) < b + 1 else None
data[a][b].append([]) if len(data[a][b]) < c + 1 else None
data[a][b][c] = value
return data
# Test
assert import_from_csv("TEST.csv") == tokens

First, I'd make writing this construction in a CSV format independent from dimensions:
import csv
def deep_iter(seq):
for i, val in enumerate(seq):
if type(val) is list:
for others in deep_iter(val):
yield i, *others
else:
yield i, val
with open('TEST.csv', 'w') as f:
csv.writer(f).writerows(deep_iter(tokens))
Next, we can use the lexicographic order of the indices to recreate the structure. All we have to do is sequentially move deeper into the output list according to the indices of a word. We stop at the penultimate index to get the last list, because the last index is pointing only at the place of the word in this list and doesn't matter due to the natural ordering:
with open('TEST.csv', 'r') as f:
rows = [*csv.reader(f)]
res = []
for r in rows:
index = r[:-2] # skip the last index and word
e = res
while index:
i = int(index.pop(0)) # get next part of a current index
if i < len(e):
e = e[i]
else:
e.append([]) # add new record at this level
e = e[-1]
e.append(r[-1]) # append the word to the corresponding list

I am trying to copy a large list to an excel spreadsheet in python

The list has 10,000,000 random numbers in it, and I can't figure out how to get the code to go to the next row when I reach the 16,384 column limit.
Current code:
import csv
import random
values = []
for i in range(1,10000000):
n = random.randint(0, 1000000)
values.append(n)
print(n)
file = open('randresults.csv', 'w')
with file:
wr = csv.writer(file, dialect = 'excel')
wr.writerow(values)
file.close()
I tried using wr.writerows(values) but it gives me Error: iterable expected, not float.

After every 16384 values you should write the values array, reset the values array and write it again on the next loop. you need a counter to count the columns
import csv
import random
values = []
file = open('randresults.csv', 'w')
wr = csv.writer(file, dialect = 'excel')
col_count = 0
for i in range(1,10000000):
col_count = col_count + 1
n = random.randint(0, 1000000)
values.append(n)
if col_count == 16384:
wr.writerow(values)
col_count = 0
values = []
# write the remaining values, an incomplete row
wr.writerow(values)
file.close()

Python: Find specific sum of values?

I am trying to find the sum of to values from a text file that is equal to 2020 with python. The text file contains multiple numbers each written in one line like this:
What is the easiest and fastest way to do this in python?
My code reads in the file correctly but does not access the inner for loop?
data = []
try:
file = open('/Users/korbinianschleifer/desktop/input.txt', 'r+')
data = file.readlines()
file.close()
except x:
print('file could not be loaded')
print(len(data))
for i in range(len(data)):
for j in range(i+1,len(data)):
if data[i]+data[j] == 2020:
print('solution found')

Your data list is a list of strings, which means + will concatenate them ("1" + "1" is "11"), which is not the behavior you want. Parse your data to an int:
# use with to avoid having to close the file manually
with open('/Users/korbinianschleifer/desktop/input.txt', 'r+') as file:
data = file.readlines()
# parse it to an int
data = [int(x) for x in data]
for i in range(len(data)):
for j in range(i + 1, len(data)):
if data[i] + data[j] == 2020:
print("solution found", data[i], data[j])
As a side note, the faster way to do this is to maintain a set of values:
# use with to avoid having to close the file manually
with open('/Users/korbinianschleifer/desktop/input.txt', 'r+') as file:
data = file.readlines()
# parse it to an int
data = [int(x) for x in data]
seen = set()
for v in data:
if 2020 - v in seen:
print("solution found", v, 2020 - v)
seen.add(v)

Comparison of two elements in different array

My problem:
I am trying to compare two elements from two different arrays but the operator is not working.
Code Snippet in question:
for i in range(row_length):
print(f"ss_record: {ss_record[i]}")
print(f"row: {row[i + 1]}")
#THIS IF STATEMENT IS NOT WORKING
if ss_record[i] == row[i + 1]:
count += 1
#print()
#print(f"row length: {row_length}")
#print(f"count: {count}")
if count == row_length:
print(row[0])
exit(0)
What I have done: I tried to print the value of ss_record and row before it runs through the if statement but when it matches, count doesn't increase. I tried storing the value of row in a new array but it bugs out and only store the array length and first 2 value of row and repeats those values every next instance.
What I think the issue: I think the issue with my code is that row is being read from a CSV file and is not being converted into an integer as a result, it appears they are the same but one is an integer while the other is a string.
Entire Code:
import csv
import sys
import re
from cs50 import get_string
from sys import argv
def main():
line_count = 0
if len(argv) != 3:
print("missing command-line argument")
exit(1)
with open(sys.argv[1], 'r') as database:
sequence = open(sys.argv[2], 'r')
string = sequence.read()
reader = csv.reader(database, delimiter = ',')
for row in reader:
if line_count == 0:
row_length = len(row) - 1
ss_record = [row_length]
for i in range(row_length):
ss_record.append(ss_count(string, row[i + 1], len(row[i + 1])))
ss_record.pop(0)
line_count = 1
else:
count = 0
for i in range(row_length):
print(f"ss_record: {ss_record[i]}")
print(f"row: {row[i + 1]}")
#THIS IF STATEMENT IS NOT WORKING
if ss_record[i] == row[i + 1]:
count += 1
if count == row_length:
print(row[0])
exit(0)
#ss_count mean the # of times the substring appear in the string
def ss_count(string, substring, length):
count = 1
record = 0
pos_array = []
for m in re.finditer(substring, string):
pos_array.append(m.start())
for i in range(len(pos_array) - 1):
if pos_array[i + 1] - pos_array[i] == length:
count += 1
else:
if count > record:
record = count
count = 1
if count > record:
record = count
return record
main()
Values to use to reproduce issue:
sequence (this is a text file) = AAGGTAAGTTTAGAATATAAAAGGTGAGTTAAATAGAATAGGTTAAAATTAAAGGAGATCAGATCAGATCAGATCTATCTATCTATCTATCTATCAGAAAAGAGTAAATAGTTAAAGAGTAAGATATTGAATTAATGGAAAATATTGTTGGGGAAAGGAGGGATAGAAGG
substring (this is a csv file) =
name,AGATC,AATG,TATC
Alice,2,8,3
Bob,4,1,5
Charlie,3,2,5
Gist of the CSV file:
The numbers beside Alice means how many times a substring(STR/Short Tandem Repeat) appears in a row in the string(DNA sequence). In this string, AGATC appears 4 times in a row, AATG appears 1 time in a row, and TATC appears 5 times in a row. For this DNA sequence, it matches Bob and he outputted as the answer.

You were right, when you compare ss_record[i] == row[i + 1]: there is a type problem, the numbers of ss_record are integers while the numbers of the row are strings. You may acknowledge the issue by printing both ss_record and row:
print("ss_record: {}".format(ss_record)) -> ss_record: [4, 1, 5]
print("row: {}".format(row)) -> row: ['Alice', '2', '8', '3']
In order for the snippet to work you just need to change the comparison to
ss_record[i] == int(row[i + 1])
That said, I feel the code is quite complex for the task. The string class implements a count method that returns the number of non-overlapping occurrences of a given substring. Also, since the code it's working in an item basis and relies heavily in index manipulations the iteration logic is hard to follow (IMO). Here's my approach to the problem:
import csv
def match_user(dna_file, user_csv):
with open(dna_file, 'r') as r:
dna_seq = r.readline()
with open(user_csv, 'r') as r:
reader = csv.reader(r)
rows = list(reader)
target_substrings = rows[0][1:]
users = rows[1:]
num_matches = [dna_seq.count(target) for target in target_substrings]
for user in users:
user_matches = [int(x) for x in user[1:]]
if user_matches == num_matches:
return user[0]
return "Not found"
Happy Coding!

About CS50 Pset6 DNA, it overcounts STR for large.cvs

I'm working on pset6, DNA problem. This code is working for small.cvs but when I try the large one it overestimates the STR count. I guess the problem is when it tries to compare strings. But still don't know how to fix it. I checked that the counting is correct for the "TTTTTTCT" sequence but for the remaining STRs, the counting is in all cases larger than it should.
import sys
import csv
def main():
while (len(sys.argv) != 3):
print ("ERROR. Usage: python dna.py data.csv sequence.txt")
break
list_str = {}
#load the STRs to analyse
with open(sys.argv[1]) as csvfile:
readcsv = csv.reader (csvfile)
ncol = len(next(readcsv))
csvfile.seek(0)
header = list()
for line in readcsv:
a = sum(1 for line in readcsv)
for i in range(ncol):
list_str[line[i]] = 0
header.insert (i, line [i])
print (f"{header[i]}")
#open an work with the sequence file
sequence = open(sys.argv[2], 'r')
seq_r = sequence.read()
for k in list_str.keys():
#print (f"keu {k}")
p = 0
seq = len(seq_r)
while p < seq:
if seq_r[p:(p + len(k))] == k:
list_str[k] += 1
p += len(k)
else: p += 1
#print (f" sequenci encontrada{list_str[k]} y {k}")
print (f"nro de {k} {list_str[k]}")
with open(sys.argv[1]) as csvfile:
readcsv = csv.reader (csvfile)
next(csvfile)
find = False
for row in readcsv:
for j in range(1,ncol):
#print(f"header :{header[j]}")
if int(row [j]) == int(list_str[header[j]]):
print (f"row {row[j]} list {list_str[header[j]]}")
find = True
else:
find = False
break
if find == True: print (f"{row [0]}")
main()

The same thing happened to me, and then I saw the specifications of the pset.
We need to find the "longest run of consecutive repeats of the STR". Not the total count of STRs. It works for the small.csv as in my case too, so try to search for the longest consecutive occurrences of the specific STR.

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

naive algorthim string matching - python

Related

How to transform a csv file into a multi-dimensional list using Python?

I am trying to copy a large list to an excel spreadsheet in python

Python: Find specific sum of values?

Comparison of two elements in different array

About CS50 Pset6 DNA, it overcounts STR for large.cvs

Categories

Resources