delete empty quotes and pattern before it in python - python

I have a file as below, whenever there is a key with empty value, I want to delete the key and the empty quotes
My file
<items="20" product="abc" condition="new">
<items="10" product="" condition="new">
<items="50" product="xyz" condition="">
<items="" product="mno" condition="fair">
desired output
<items="20" product="abc" condition="new">
<items="10" condition="new">
<items="50" product="xyz">
<product="mno" condition="fair">
I tried somehting like this, this deleted only the quotes. I want to delete the quotes and the value before "="
f= open('test.txt','r')
A1=f.read()
for i in A1:
if i=="''":
A1.remove(i)
print A1
break

You could use a regular expression:
import re
with open('test.txt','r') as A1:
for i in A1:
print(re.sub('[a-z-]+=\"\" *', '', i))

A possible solution could be:
with open('test.txt','r+') as f:
for line in f:
Line=line[1:len(line)-1]
L=Line.split()
for k in L:
if("" not in k):
f.write(k)
f.write(" ")

You could write a function to pass the lines through:
with open('in_file', 'r') as f:
lines = f.readlines()
def process_line(line):
line = line.split('<')[1].rsplit('>')[0]
valids = [val for val in line.split(' ') if '""' not in val]
line = '<{}>\n'.format(' '.join(valids))
return line
with open('out_file', 'w') as f:
for line in lines:
f.write(process_line(line))

You can use regex,
with open('tmp.txt', 'r') as f_in:
with open('tmp_clean.txt', 'w') as f_outfile:
f_out = csv.writer(f_outfile)
for line in f_in:
line = line.strip()
row = []
if bool(re.search('(.*="")', line)):
line = re.sub('[a-z]+=\"\"', '',line)
row.append(line)
else:
row.append(line)
f_out.writerow(row)

Related

find character at last of line in python file move it to start of line

I have to read one python file and whenever i get } closing curly braces at the end of line move it to start of line
for eg.
input.py
print("hello")
print("shankar")}
required output
print("hello")
}
print("shankar")
current output getting now
print("hello")
}print("shankar")}
code
pattern='}'
f = open('input.py', 'r')
g = open('tempo.py', 'w')
l=f.readlines()
f.close()
for ind in range(len(l)):
if pattern in l[ind]:
l[ind]= "}" + l[ind]
l[ind] = l[ind].rstrip("}")
for field in l:
g.write(field)
g.close()
print('Done')
You may need to consider the newline character:
pattern='}'
f = open('input.py', 'r')
g = open('tempo.py', 'w')
l=f.readlines()
f.close()
for ind in range(len(l)):
if pattern in l[ind]:
l[ind]= "}" + l[ind]
l[ind] = l[ind].replace("}\n","\n")
for field in l:
g.write(field)
g.close()
print('Done')
note: if your line break is windows-like then you may need to do replace ("}\r\n","\r\n")

Printing the lines after matching text

I am trying to print the line that is after text matched in the text file.
Something like this:
import re
afterlines=3
with open(filename, 'r') as f:
for line in f:
if line.strip()== ls_losses:
row = f.readline(+afterlines)
print (row)
print ("true")
I would just use a temporary counter.
import re
afterlines=3
temporary_lines = ''
with open(filename, 'r') as f:
for line in f:
if line.strip() == ls_losses:
counter = afterlines
if counter > 0:
temporary_lines += f.readline()
counter -= 1
else:
print(temporary_lines)
temporary_lines = '' # Reinitialize to get ready for the next match
print ("true")

Reading a specific portion of data from a .txt file in Python

>gene1
ATGATGATGGCG
>gene2
GGCATATC
CGGATACC
>gene3
TAGCTAGCCCGC
This is the text file which I am trying to read.
I want to read every gene in a different string and then add it in a list
There are header lines starting with ’>’ character to recognize if this is a start or end of a gene
with open('sequences1.txt') as input_data:
for line in input_data:
while line != ">":
list.append(line)
print(list)
When printed the list should display list should be
list =["ATGATGATGGCG","GGCATATCCGGATACC","TAGCTAGCCCGC"]
with open('sequences1.txt') as input_data:
sequences = []
gene = []
for line in input_data:
if line.startswith('>gene'):
if gene:
sequences.append(''.join(gene))
gene = []
else:
gene.append(line.strip())
sequences.append(''.join(gene)) # append last gene
print(sequences)
output:
['ATGATGATGGCG', 'GGCATATCCGGATACC', 'TAGCTAGCCCGC']
You have multiple mistakes in your code, look here:
with open('sequences1.txt', 'r') as file:
list = []
for line in file.read().split('\n'):
if not line.startswith(">") and len(line$
list.append(line)
print(list)
Try this:
$ cat genes.txt
>gene1
ATGATGATGGCG
>gene2
GGCATATC
CGGATACC
>gene3
TAGCTAGCCCGC
$ python
>>> genes = []
>>> with open('genes.txt') as file_:
... for line in f:
... if not line.startswith('>'):
... genes.append(line.strip())
...
>>> print(genes)
['ATGATGATGGCG', 'GGCATATC', 'CGGATACC', 'TAGCTAGCCCGC']
sequences1.txt:
>gene1
ATGATGATGGCG
>gene2
GGCATATC
CGGATACC
>gene3
TAGCTAGCCCGC
and then:
desired_text = []
with open('sequences1.txt') as input_data:
content = input_data.readlines()
content = [l.strip() for l in content if l.strip()]
for line in content:
if not line.startswith('>'):
desired_text.append(line)
print(desired_text)
OUTPUT:
['ATGATGATGGCG', 'GGCATATC', 'CGGATACC', 'TAGCTAGCCCGC']
EDIT:
Sped-read it, fixed it with the desired output
with open('sequences1.txt') as input_data:
content = input_data.readlines()
# you may also want to remove empty lines
content = [l.strip() for l in content if l.strip()]
# flag
nextLine = False
# list to save the lines
textList = []
concatenated = ''
for line in content:
find_TC = line.find('gene')
if find_TC > 0:
nextLine = not nextLine
else:
if nextLine:
textList.append(line)
else:
if find_TC < 0:
if concatenated != '':
concatenated = concatenated + line
textList.append(concatenated)
else:
concatenated = line
print(textList)
OUTPUT:
['ATGATGATGGCG', 'GGCATATCCGGATACC', 'TAGCTAGCCCGC']

Is it possible to save the print-output in a dict with python?

I want to know, if it's possible to save the output of this code into a dictionary (maybe it's also the wrong data-type). I'm not expirienced in coding yet, so I can't think of a way it could work.
I want to create a dicitionary that has the lines of the txt.-file in it alongside the value of the corresponding line. In the end, I want to create a code, where the user has the option to search for a word in the line through an input - the output should return the corresponding line. Has anyone a suggestion? Thanks in advance! Cheers!
filepath = 'myfile.txt'
with open(filepath) as fp:
line = fp.readline()
cnt = 1
while line:
print("Line {}: {}".format(cnt, line.strip()))
line = fp.readline()
cnt += 1
This should do it (using the code you provided as a framework, it only takes one extra line to store it in a dictionary):
my_dict={}
filepath = 'myfile.txt'
with open(filepath) as fp:
line = fp.readline()
cnt = 1
while line:
# print("Line {}: {}".format(cnt, line.strip()))
my_dict[str(line.strip())] = cnt
line = fp.readline()
cnt += 1
Then, you can prompt for user input like this:
usr_in = input('enter text to search: ')
print('That text is found at line(s) {}'.format(
[v for k,v in my_dict.items() if usr_in in k]))
For storing the line string value as key in dictionary and line number as value, you can try something like:
filepath = 'myfile.txt'
result_dict = {}
with open(filepath) as fp:
for line_num, line in enumerate(fp.readlines()):
result_dict[line.strip()] = line_num+1
Or, using dictionary comprehension, above code can be:
filepath = 'myfile.txt'
with open(filepath) as fp:
result_dict = {line.strip(): line_num+1
for line_num, line in enumerate(fp.readlines())}
Now to search and return all the lines with words:
search_result = [{key: value} for key, value in result_dict.items()
if search_word in key]

How to format the output of dictionary in python

I would like to format the values of a dictionary in python. Here is the script that i have used to generate the output
entries = {}
entries1 = {}
with open('no_dup.txt', 'r') as fh_in:
for line in fh_in:
if line.startswith('E'):
line = line.strip()
line = line.split()
entry = line[0]
if entry in entries:
entries[entry].append(line)
else:
entries[entry] = [line]
with open('no_dup_out.txt', 'w') as fh_out:
for kee, val in entries.iteritems():
if len(val) == 1:
fh_out.write("{} \n".format(val))
with open('no_dup_out.txt', 'r') as fh_in2:
for line in fh_in2:
line = line.strip()
line = line.split()
entry = line[1]
if entry in entries1:
entries1[entry].append(line)
else:
entries1[entry] = [line]
with open('no_dup_out_final.txt', 'w') as fh_out2:
for kee, val in entries1.iteritems():
if len(val) == 1:
fh_out2.write("{} \n".format(val))
For example by running the above script i generated the following output
[["[['ENSGMOG00000003747',", "'ENSORLG00000006947']]"]]
[["[['ENSGMOG00000003752',", "'ENSORLG00000005385']]"]]
[["[['ENSGMOG00000003760',", "'ENSORLG00000005379']]"]]
[["[['ENSGMOG00000003748',", "'ENSORLG00000004636']]"]]
[["[['ENSGMOG00000003761',", "'ENSORLG00000005382']]"]]
And i would like to format it such as way that i remove all the parentheses and commas (ENSGMOG00000003747 ENSORLG00000006947) and output the rest as it is using tab delimited format. How can i do that?
If your list of lists is full_list, then you could have the following code give your desired output:
desired_list = ['\t'.join([element.split('\'')[1] for element in list_item[0]]) for list_item in full_list]

Categories