Data sorting, combining 2 lines - python

I have a file looking this way:
;1;108/1;4, 109
;1;51;4, 5
;2;109/2;4, 5
;2;108/2;4, 109
;3;108/2;4, 109
;3;51;4, 5
;4;109/2;4, 5
;4;51;4, 5
;5;109/2;4, 5
;5;40/6;5, 6, 7
where
;id1;id2;position_on_shelf_id2
;id1;id3;position_on_shelf_id3
as a result, i want to get:
id1;id2-id3;x
where x are common shelf positions for both id2 and id3, it should look like this
1;108/1-51;4
2;109/2-108/2;4
3;108/2-51;4
4;109/2-51;4, 5
5;109/2-40/6;5
my script works fine up to the moment where I need to type common shelf positions. I tried using .intersection, but it is not working properly, when I have positions consisting of double characters (pos:144-result: 14; pos:551, result: 51; pos:2222-result: 2 i.e)
result = id2_chars.intersection(id3_chars)
any fix for intersection? or maybe some better method on your mind?
code so far:
part1 - merge every 2nd line together
exp = open('output.txt', 'w')
with open("dane.txt") as f:
content = f.readlines()
strng = ""
for i in range(1,len(content)+1):
strng += content[i-1].strip()
if i % 2 == 0:
exp.writelines(strng + '\n')
strng = ""
exp.close()
part2 - intersection:
exp = open('output2.txt', 'w')
imp = open('output.txt')
for line in imp:
none, lp1, dz1, poz1, lp2, dz2, poz2 = line.split(';')
s1 = poz1.lower()
s2 = poz2.lower()
s1_chars = set(s1)
s2_chars = set(s2)
result = s1_chars.intersection(s2_chars)
result = str(result)
exp.writelines(lp1 + ';' + dz1 + '-' + dz2 + ';' + result + '\n')
exp.close()
** i did not filtered the result for my needs yet (it is in "list" form), but it won't be a problem once I get the right intersection result

Your main problem is that you try to intersect 2 sets of characters while you should intersect positions. So you should at least use:
...
s1 = poz1.lower()
s2 = poz2.lower()
s1_poz= set(x.strip() for x in s1.split(','))
s2_poz = set(x.strip() for x in s1.split(','))
result = s1_poz.intersection(s2_poz)
result = ', '.join(result)
...
But in fact, you could easily do the whole processing in one single pass:
exp = open('output.txt', 'w')
with open("dane.txt") as f:
old = None
for line in f: # one line at a time is enough
line = line.strip()
if old is None: # first line of a block, just store it
old = line
else: # second line of a bock, process both
none, lp1, dz1, poz1 = old.split(';')
none, lp2, dz2, poz2 = line.split(';')
poz1x = set(x.strip() for x in poz1.tolower().split(','))
poz2x = set(x.strip() for x in poz2.tolower().split(','))
result = ', '.join(poz1x.intersection(poz2x))
exp.write(lp1 + ';' + dz1 + '-' + dz2 + ';' + result + '\n')
old = None

Related

Python For Loop Count and Insert Numbers beside words

I have a problem regarding Python, I want to count the list item and then put a number beside the value of text.
This is the output I want:
test = 1
me = 2
texting = 3
This is the output I always get:
test = 3
me = 3
texting = 3
Here is my line of code:
text = request.form['title']
text2 = text.splitlines()
count = len(text2)
textarray = []
x = 0;
while(x <count):
for txt in text2:
textarray = [txt + " = " + str(x) for txt in text2]
x = x+1
string = '<br>'.join(textarray)
return render_template('index.html', text=string)
Fix
You don't need 2 loops, just iterate over text2 and increment your xn then append to the array, don't recreate it wit nonsense
textarray = []
x = 1
for txt in text2:
textarray.append(txt + " = " + str(x))
x = x + 1
Improve
Use enumerate to generate increasing value along with an iterable
textarray = []
for idx, txt in enumerate(text2, 1):
textarray.append(f"{txt} = {idx}")
Best
Use generator construction and inline it
text = "test\nme\ntexting"
result = '</br>'.join(
(f"{word}={idx}" for idx, word in enumerate(text.splitlines(), 1))
)
# return render_template('index.html', text=result)

Python code for optimal alignment score and sequence giving wrong result

This is my first time coding, so please do understand my code is very messy. I have done two different ways to get the optimal score and the optimal sequence, unfortunately both of my answers are wrong. In my code I have included a way to open a fasta file, but since this seemed to not work I also just included the sequences in the code myself.
My optimal score is computed but not printed for some reason- it is also wrong I have 208 when I should get 275. I also dont get a correct alignment score back.
The two sequences are
The scoring alignment needs to follow , 11 for internal gaps, 8 for terminal gaps on the 5' end, 7 for gaps on the 3' end, 4 for mismatches, 0 for matches
My file is at [removed link]
my_file = open("one.fasta","w")
my_file.write (""">Testseq1
TCTGGTGTCCTAGGCGTAGAGGAACCACACCAATCCATCCCGAACTCTGGTGGTTAAACTCTACTGCGGTGACGATACT""")
sequenceone= open("one.fasta","r")
line = sequenceone.readline()
header = ""
seqA = ""
while line:
line = line.rstrip("\n")
if ">" in line:
header = line
else :
seqA = seqA + line
line = sequenceone.readline()
my_file.close()
my_files = open("two.fasta","w")
my_files.write (""">Testseq2
TGGTGCGGTCATACCAGCGCTAATGCACCGGATCCCATCAGAACTCCGCAGTTAAGCGCGCTTGGGCCAGAACAGTACTGGGATGGGTGTCC""")
sequencetwo= open("two.fasta","r")
line = sequencetwo.readline()
header = ""
seqB = ""
while line:
line = line.rstrip("\n")
if ">" in line:
header = line
else :
seqB = seqB + line
line = sequencetwo.readline()
my_files.close()
alphabet = ["A","C","G","T"]
score = [[8,8,8,8,8],\
[0,4,4,4,11],\
[4,0,4,4,11],\
[4,4,0,4,11],\
[4,4,4,0,11],\
[7,7,7,7,7]]
def Global(a,b):
D = []
for i in range(len(a)+1):
D.append([0]* (len(b)+1))
for i in range(len(a)+1):
D[i][0] = D[i-1][0] + score[alphabet.index(a[i-1])][-1]
for i in range(len(b)+1):
D[0][i] = D[0][i-1] + score[-1][alphabet.index(b[i-1])]
for i in range (1, len(a)+1):
for j in range (1, len(b)+1):
distHor = D[i][j-1] + score[-1][alphabet.index(b[j-1])]
distVer = D[i-1][j] + score[alphabet.index(a[i-1])][-1]
if a[i-1] == b[j-1]:
distDiag = D[i-1][j-1]
else:
distDiag = D[i-1][j-1] + score[alphabet.index(a[i-1])][alphabet.index(b[j-1])]
D[i][j] = min(distHor, distVer, distDiag)
return D[-1][-1]
seqA = "TCTGGTGTCCTAGGCGTAGAGGAACCACACCAATCCATCCCGAACTCTGGTGGTTAAACTCTACTGCGGTGACGATACT"
seqB = "TGGTGCGGTCATACCAGCGCTAATGCACCGGATCCCATCAGAACTCCGCAGTTAAGCGCGCTTGGGCCAGAACAGTACTGGGATGGGTGTCC"
row = len(seqA)+1
column = len(seqB)+1
match = 0
mismatch = 4
gap = 11
align1=""
align2=""
matrix=[[[[None] for i in range (2)] for i in range(column)] for i in range(row)]
for i in range(column):
matrix[0][i][0]=gap*i
if(i>0):
matrix[0][i][1]="hor"
for i in range(row):
matrix[i][0][0]=gap*i
if(i>0):
matrix[i][0][1]="ver"
for i in range(1,row):
for j in range(1,column):
hor=matrix[i][j-1][0]+gap
ver=matrix[i-1][j][0]+gap
if (seqA[i-1]==seqB[j-1]):
diag=matrix[i-1][j-1][0]+match
else:
diag=matrix[i-1][j-1][0]+mismatch
var = {hor:"hor",ver:"ver",diag:"diag"}
hvd=[hor,ver,diag]
matrix[i][j][0]=max(hvd)
matrix[i][j][1]=var.get(max(var))
k=row
l=column
while(True):
if(l==1 and k==1):
break
else:
if(matrix[k-1][l-1][1]=="ver"):
align1+=seqA[k-2]
align2+="-"
k-=1
elif(matrix[k-1][l-1][1]=="hor"):
align1+="-"
align2+=seqB[l-2]
l-=1
elif(matrix[k-1][l-1][1]=="diag"):
align1+=seqA[k-2]
align2+=seqB[l-2]
k-=1
l-=1
align1=align1[::-1]
align2=align2[::-1]
print (align1)
print (align2)
Global(seqA,seqB)
Please can anyone guide me on what I am doing wrong?

Extracting data from a text file to an output file

I have alot of files which names are just number. (Starting from 1 to whatever is the maximum number) and each of these files are similar to each other by their "tags" (ObjectID =, X =, Y =, etc.), but the values after those tags are not the same at all.
I wanted to make my job easier from manually copy/pasting the data from one file to another and made a small script using Python (since I am slightly experienced in it).
This is the full script:
import os
BASE_DIRECTORY = 'C:\Users\Tom\Desktop\TheServer\scriptfiles\Objects'
output_file = open('output.txt', 'w')
output = {}
file_list = []
for (dirpath, dirnames, filenames) in os.walk(BASE_DIRECTORY):
for f in filenames:
if 'txt' in str(f):
e = os.path.join(str(dirpath), str(f))
file_list.append(e)
for f in file_list:
print f
txtfile = open(f, 'r')
output[f] = []
for line in txtfile:
if 'ObjectID =' in line:
output[f].append(line)
elif 'X =' in line:
output[f].append(line)
elif 'Y =' in line:
output[f].append(line)
tabs = []
for tab in output:
tabs.append(tab)
tabs.sort()
for tab in tabs:
for row in output[tab]:
output_file.write(row + '')
Now, everything is working fine, the output file looks like this:
ObjectID = 1216
X = -1480.500610
Y = 2610.885742
ObjectID = 970
X = -1517.210693
Y = 2522.842285
ObjectID = 3802
X = -1512.156616
Y = 2521.116210
etc.
But I don't want it to be like that (each value has a new line). I need it to do this for every file:
Read the file.
Remove the tags infront of the values.
Format a single line which will have those values in the output folder. (Let's say I want to make it look like this: "(1216,-1480.500610,2522.842285)" )
Write that line in the output folder.
Repeat for every file.
Any help please?
Hope this helps.
data = open('sam.txt', 'r').read()
>>> print data
ObjectID = 1216
X = -1480.500610
Y = 2610.885742
ObjectID = 970
X = -1517.210693
Y = 2522.842285
ObjectID = 3802
X = -1512.156616
Y = 2521.116210
>>>
Now lets do some string replacements :)
>>> data = data.replace('ObjectID =', '').replace('\nX = ', ',').replace('\nY = ', ',')
>>> print data
1216,-1480.500610,2610.885742
970,-1517.210693,2522.842285
3802,-1512.156616,2521.116210
In your loop, keep track of whether you are 'in' a record:
records = []
in_record = False
id, x, y = 0, 0, 0
for line in txtfile:
if not in_record:
if 'ObjectID =' in line:
in_record = True
id = line[10:]
elif 'X =' in line:
x = line[3:]
elif 'Y =' in line:
y = line[3:]
records.append((id, x, y))
in_record = False
Then you'll have a list of tuples which you can easily write with the csv module.
Find here a version of the loop you have generating the contents.
I rewrote it so the line contents ObjectId, X and Y are in the same line.
It looks that is what you want to do:
for f in file_list:
print f
txtfile = open(f, 'r')
output[f] = []
for line in txtfile:
myline = ''
if 'ObjectID =' in line:
pos = line.rfind("ObjectID =") + len("ObjectID =")
rest = line[pos:]
# Here you set the delimiter after the ObjectID value. Can be ","
numbers = rest.split(" ")
if len(numbers) > 0:
myline.append(numbers[0])
elif 'X =' in line:
pos = line.rfind("X =") + len("X =")
rest = line[pos:]
# Here you set the delimiter after the ObjectID value. Can be ","
numbers = rest.split(" ")
if len(numbers) > 0:
myline.append(numbers[0])
elif 'Y =' in line:
pos = line.rfind("Y =") + len("Y =")
rest = line[pos:]
# Here you set the delimiter after the ObjectID value. Can be ","
numbers = rest.split(" ")
if len(numbers) > 0:
myline.append(numbers[0])
output[f].append(myline)
Note that you need to know which character (in the code the delimiter) separates the names you try to find: ObjectID = from the actual values you want to grab from the line.
Here is what you need. I did not have enough time to write the code for appending the result to a new file. Instead it just prints it, but you get the point.
import os.path
path = "path"
#getting the number of files in your folder
num_files = len([f for f in os.listdir(path)
if os.path.isfile(os.path.join(path, f))])
#function that returns your desired output for a given file
def file_head_ext(file_path, file_num):
with open(file_path + "/" + file_num) as myfile:
head = [next(myfile).split("=") for x in range(3)]
formatted_head = [elm[1].replace("\n",'').replace(" ","") for elm in head]
return(",".join(formatted_head))
for filnum in range(1,num_files):
print(file_head_ext(path, str(filnum)))

Writing a list from a for loop into a csv

I wrote a for loop that iterates through a CSV to get a list like this:
[t1, s1]
[t2, s2]
[t3, s3]
and so 4 thousand times.
Now I need to write these into a new CSV file, where they'd populate 2 fields and be separated by a comma.
When I enter this, I only get the last list from the last loop, and with one character in a cell.
def sentiment_analysis():
fo = open("positive_words.txt", "r")
positive_words = fo.readlines()
fo.close()
positive_words = map(lambda positive_words: positive_words.strip(), positive_words)
fo = open("negative_words.txt", "r")
negative_words = fo.readlines()
fo.close()
negative_words = map(lambda negative_words: negative_words.strip(), negative_words)
fo = open("BAC.csv", "r")
data = fo.readlines()
fo.close()
data = map(lambda data: data.strip(), data)
x1 = 0 #number of bullish
x2 = 0 #number of bearish
x3 = 0 #number of unknown
for info in data:
data_specs = info.split(',')
time_n_date = data_specs[0]
sentiment = data_specs[2]
'''Possibly precede with a nested for loop for data_specs???'''
if sentiment == 'Bullish':
'''fo.write(time + ',' + 'Bullish' + '\n')'''
elif sentiment == 'Bearish':
''' fo.write(time + ',' + 'Bearish' + '\n')'''
else:
x3 += 1
positive = 0
negative = 0
content_words = data_specs[1].split()
for a in positive_words:
for b in content_words:
if (a == b):
positive = positive + 1
for c in negative_words:
for d in content_words:
if (c == d):
negative = negative + 1
if positive > negative:
'''fo.write(time + ',' + 'Bullish' + '\n')'''
sentiment = 'Bullish'
elif positive < negative:
sentiment = 'Bearish'
else:
sentiment = 'Neutral'
bac2data = [time_n_date, sentiment]
print bac2data
fo = open("C:\Users\Siddhartha\Documents\INFS 772\Project\Answer\BAC2_answer.csv", "w")
for x in bac2data:
w = csv.writer(fo, delimiter = ',')
w.writerows(x)
fo.close()
My for loop isn't going through it all.
In your code bac2data = [time_n_date, sentiment] creates a list containing 2 string items. The proper way to write that to a CSV file with csv.writer() is with writerow(bac2data).
The last part of your code contains a number of errors. Firstly you are opening the CSV file in write mode ('w') for every line of the incoming data. This will overwrite the file each time, losing all data except the last line. Then you are iterating over the bac2data list and calling writerows() on each item. That's going to write each character from the string on it's own line (which matches your reported output).
Instead, open the output file and create a csv.writer outside of the main for info in data: loop:
fo = open("C:\Users\Siddhartha\Documents\INFS 772\Project\Answer\BAC2_answer.csv", "w")
writer = csv.writer(fo)
for info in data:
....
Then replace these lines at the bottom of the main loop:
bac2data = [time_n_date, sentiment]
print bac2data
fo = open("C:\Users\Siddhartha\Documents\INFS 772\Project\Answer\BAC2_answer.csv", "w")
for x in bac2data:
w = csv.writer(fo, delimiter = ',')
w.writerows(x)
fo.close()
with this:
bac2data = [time_n_date, sentiment]
print bac2data
writer.writerow(bac2data)
Once you have that working, and no longer need to print bac2data for debugging, you can just use 1 line:
writer.writerow((time_n_date, sentiment)]
Update
Complete code for function:
def sentiment_analysis():
fo = open("positive_words.txt", "r")
positive_words = fo.readlines()
fo.close()
positive_words = map(lambda positive_words: positive_words.strip(), positive_words)
fo = open("negative_words.txt", "r")
negative_words = fo.readlines()
fo.close()
negative_words = map(lambda negative_words: negative_words.strip(), negative_words)
fo = open("BAC.csv", "r")
data = fo.readlines()
fo.close()
data = map(lambda data: data.strip(), data)
x1 = 0 #number of bullish
x2 = 0 #number of bearish
x3 = 0 #number of unknown
fo = open("C:\Users\Siddhartha\Documents\INFS 772\Project\Answer\BAC2_answer.csv", "w")
writer = csv.writer(fo)
for info in data:
data_specs = info.split(',')
time_n_date = data_specs[0]
sentiment = data_specs[2]
'''Possibly precede with a nested for loop for data_specs???'''
if sentiment == 'Bullish':
'''fo.write(time + ',' + 'Bullish' + '\n')'''
elif sentiment == 'Bearish':
''' fo.write(time + ',' + 'Bearish' + '\n')'''
else:
x3 += 1
positive = 0
negative = 0
content_words = data_specs[1].split()
for a in positive_words:
for b in content_words:
if (a == b):
positive = positive + 1
for c in negative_words:
for d in content_words:
if (c == d):
negative = negative + 1
if positive > negative:
'''fo.write(time + ',' + 'Bullish' + '\n')'''
sentiment = 'Bullish'
elif positive < negative:
sentiment = 'Bearish'
else:
sentiment = 'Neutral'
bac2data = [time_n_date, sentiment]
print bac2data
writer.writerow(bac2data)
fo.close()

Is there some kind of limit to the amount of output Python 3.4 allows using the write() method at one time?

I put trailing print() methods right next to my write() method lines at the end of my code to test why my output files were incomplete. But, the print() output is "all the stuff" I expect; while the write() output is off by a confusing amount (only 150 out of 200 'things'). Reference Image of Output: IDLE versus external output file
FYI: Win 7 64 // Python 3.4.2
My modules take an SRT captions file ('test.srt') and returns a list object I create from it; in particular, one with 220 list entries of the form: [[(index), [time], string]]
times = open('times.txt', 'w')
### A portion of Riobard's SRT Parser: srt.py
import re
def tc2ms(tc):
''' convert timecode to millisecond '''
sign = 1
if tc[0] in "+-":
sign = -1 if tc[0] == "-" else 1
tc = tc[1:]
TIMECODE_RE = re.compile('(?:(?:(?:(\d?\d):)?(\d?\d):)?(\d?\d))?(?:[,.](\d?\d?\d))?')
match = TIMECODE_RE.match(tc)
try:
assert match is not None
except AssertionError:
print(tc)
hh,mm,ss,ms = map(lambda x: 0 if x==None else int(x), match.groups())
return ((hh*3600 + mm*60 + ss) * 1000 + ms) * sign
# my code
with open('test.srt') as f:
file = f.read()
srt = []
for line in file:
splitter = file.split("\n\n")
# SRT splitter
i = 0
j = len(splitter)
for items in splitter:
while i <= j - 2:
split_point_1 = splitter[i].index("\n")
split_point_2 = splitter[i].index("\n", split_point_1 + 1)
index = splitter[i][:split_point_1]
time = [splitter[i][split_point_1:split_point_2]]
time = time[0][1:]
string = splitter[i][split_point_2:]
string = string[1:]
list = [[(index), [time], string]]
srt += list
i += 1
# time info outputter
i = 0
j = 1
for line in srt:
if i != len(srt) - 1:
indexer = srt[i][1][0].index(" --> ")
timein = srt[i][1][0][:indexer]
timeout = srt[i][1][0][-indexer:]
line_time = (tc2ms(timeout) - tc2ms(timein))/1000
space_time = ((tc2ms((srt[j][1][0][:indexer]))) - (tc2ms(srt[i][1][0][-indexer:])))/1000
out1 = "The space between Line " + str(i) + " and Line " + str(j) + " lasts " + str(space_time) + " seconds." + "\n"
out2 = "Line " + str(i) + ": " + str(srt[i][2]) + "\n\n"
times.write(out1)
times.write(out2)
print(out1, end="")
print(out2)
i += 1
j += 1
else:
indexer = srt[i][1][0].index(" --> ")
timein = srt[i][1][0][:indexer]
timeout = srt[i][1][0][-indexer:]
line_time = (tc2ms(timeout) - tc2ms(timein))/1000
outend = "Line " + str(i) + ": " + str(srt[i][2]) + "\n<End of File>"
times.write(outend)
print(outend)
My two write() method output files, respectively, only print out either ~150 or ~200 items of the 220 things it otherwise correctly prints to the screen.
You want to close your times file when done writing; operating systems use write buffers to speed up file I/O, collecting larger blocks of data to be written to disk in one go; closing the file flushes that buffer:
times.close()
Consider opening the file in a with block:
with open('times.txt', 'w') as times:
# all code that needs to write to times

Categories