I would like to format the values of a dictionary in Python. Here is the script that I have used to generate the output:
entries = {}
entries1 = {}

with open('no_dup.txt', 'r') as fh_in:
    for line in fh_in:
        if line.startswith('E'):
            line = line.strip()
            line = line.split()
            entry = line[0]
            if entry in entries:
                entries[entry].append(line)
            else:
                entries[entry] = [line]

with open('no_dup_out.txt', 'w') as fh_out:
    for kee, val in entries.iteritems():
        if len(val) == 1:
            fh_out.write("{} \n".format(val))

with open('no_dup_out.txt', 'r') as fh_in2:
    for line in fh_in2:
        line = line.strip()
        line = line.split()
        entry = line[1]
        if entry in entries1:
            entries1[entry].append(line)
        else:
            entries1[entry] = [line]

with open('no_dup_out_final.txt', 'w') as fh_out2:
    for kee, val in entries1.iteritems():
        if len(val) == 1:
            fh_out2.write("{} \n".format(val))
For example, by running the above script I generated the following output:
[["[['ENSGMOG00000003747',", "'ENSORLG00000006947']]"]]
[["[['ENSGMOG00000003752',", "'ENSORLG00000005385']]"]]
[["[['ENSGMOG00000003760',", "'ENSORLG00000005379']]"]]
[["[['ENSGMOG00000003748',", "'ENSORLG00000004636']]"]]
[["[['ENSGMOG00000003761',", "'ENSORLG00000005382']]"]]
And I would like to format it in such a way that I remove all the brackets, quotes, and commas (so a line reads like ENSGMOG00000003747 ENSORLG00000006947) and output the rest as it is in tab-delimited format. How can I do that?
If your list of lists is full_list, then the following code gives your desired output:
desired_list = ['\t'.join([element.split('\'')[1] for element in list_item[0]]) for list_item in full_list]
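If you would rather avoid cleaning up the strings afterwards, the same '\t'.join can be applied when the file is written in the first place, so the nested-list representation never ends up on disk. A minimal sketch of the first pass only, assuming the same no_dup.txt layout and Python 2 as in the script above (the second pass can write its keeper lines the same way):

entries = {}
with open('no_dup.txt', 'r') as fh_in:
    for line in fh_in:
        if line.startswith('E'):
            fields = line.split()
            entries.setdefault(fields[0], []).append(fields)

with open('no_dup_out.txt', 'w') as fh_out:
    for key, val in entries.iteritems():
        if len(val) == 1:
            # val[0] is a list like ['ENSGMOG...', 'ENSORLG...']
            fh_out.write('\t'.join(val[0]) + '\n')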
I have a file in1.txt
info="0x0000b573" data="0x7" id="sp. PCU(Si)"
info="0x0000b573" data="0x00000007" id="HI all. SHa"
info="0x00010AC3" data="0x00000003" id="abc_16. PS"
info="0x00010ac3" data="0x00000045" id="hB2_RC/BS (Spr)"
info="0x205" data="0x00000010" id="cgc_15. PK"
info="0x205" data="0x10" id="cgsd_GH/BS (Scd)"
Expected output: out.txt
info="0x00010AC3" data="0x00000003" id="abc_16. PS"
info="0x00010ac3" data="0x00000045" id="hB2_RC/BS (Spr)"
I need only lines that have same info values and different data values to be written to out.txt.
But the current code removes all the lines that contain the string data in them.
with open("in.txt", "r") as fin,open("out.txt", "w") as fout:
for line in fin:
if 'data' not in line:
fout.write(line.strip()+'\n')
What I need is, for example: lines 1 and 2 have the same info="0x0000b573", and their data values "0x7" and "0x00000007" are the same value, so those lines should be removed.
You can use a regex:
import re
s = '''info="0x0000b573" data="0x7" id="sp. PCU(Si)"
info="0x0000b573" data="0x00000007" id="HI all. SHa"
info="0x00010AC3" data="0x00000003" id="abc_16. PS"
info="0x00010ac3" data="0x00000045" id="hB2_RC/BS (Spr)"
info="0x205" data="0x00000010" id="cgc_15. PK"
info="0x205" data="0x10" id="cgsd_GH/BS (Scd)"'''
parsed_data = re.findall(r'info="([^"]+)" data="([^"]+)" id="[^"]+"', s, re.MULTILINE)
parsed_data = sorted([list(map(lambda x: int(x, 16), i)) + [index] for index,i in enumerate(parsed_data)])
row_numbers = [j for i in [[parsed_data[i][-1], parsed_data[i+1][-1]] for i in range(0,len(parsed_data),2) if parsed_data[i][1] != parsed_data[i+1][1]] for j in i]
final_output = []
for index, line in enumerate(s.split('\n')):
    if index in row_numbers:
        final_output.append(line)

final_out_text = '\n'.join(final_output)
print(final_out_text)
# info="0x00010AC3" data="0x00000003" id="abc_16. PS"
# info="0x00010ac3" data="0x00000045" id="hB2_RC/BS (Spr)"
You could also try something like this, I think:
#!/usr/bin/python3

records = {}
items = []
info = []
data = []

with open("in.dat", "r") as fin:
    for line in fin:
        items = line.split(' ')
        info = items[0].split('=')
        data = items[1].split('=')
        try:
            key = info[1].strip('"').lower()
            value = str(int(data[1].strip('"'), 16))
            records[key][value] += 1
        except KeyError:
            try:
                records[key][value] = 1
            except KeyError:
                records[key] = {value: 1}

out = dict()
for key in records:
    for value in records[key]:
        if records[key][value] == 1:
            try:
                out[key].append(value)
            except KeyError:
                out[key] = [value]

with open("out.dat", "w") as fout:
    for key in out:
        for value in out[key]:
            fout.write(f"{key}={value}\n")
Something like this could work:
found_info_values = []
with open("in.txt", "r") as fin, open("out.txt", "w") as fout:
    for line in fin:
        info = line.split('"')[1]
        if info not in found_info_values:
            fout.write(line.strip() + '\n')
            found_info_values.append(info)
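If you need the stated rule exactly (keep only lines that share the same info value but carry different data values), grouping by the parsed info value and comparing the data values may be closer to the goal. A rough sketch, assuming the in.txt format shown above and that differences in hex case and zero-padding should be ignored:

import re
from collections import defaultdict

pattern = re.compile(r'info="([^"]+)"\s+data="([^"]+)"')

# normalized info value -> list of (normalized data value, original line)
groups = defaultdict(list)
with open('in.txt') as fin:
    for line in fin:
        m = pattern.search(line)
        if not m:
            continue
        info, data = m.groups()
        groups[int(info, 16)].append((int(data, 16), line))

with open('out.txt', 'w') as fout:
    for pairs in groups.values():
        data_values = {d for d, _ in pairs}
        # keep the group only if the info value repeats with differing data
        if len(pairs) > 1 and len(data_values) > 1:
            for _, line in pairs:
                fout.write(line)

On the sample input this writes exactly the two 0x00010AC3 / 0x00010ac3 lines, because their data values differ, while the 0x0000b573 and 0x205 groups only repeat the same data value and are dropped.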
>gene1
ATGATGATGGCG
>gene2
GGCATATC
CGGATACC
>gene3
TAGCTAGCCCGC
This is the text file which I am trying to read.
I want to read every gene into a separate string and then add it to a list.
There are header lines starting with the '>' character, so you can recognize where one gene ends and the next begins.
with open('sequences1.txt') as input_data:
    for line in input_data:
        while line != ">":
            list.append(line)

print(list)
When printed, the list should be:
list =["ATGATGATGGCG","GGCATATCCGGATACC","TAGCTAGCCCGC"]
with open('sequences1.txt') as input_data:
    sequences = []
    gene = []
    for line in input_data:
        if line.startswith('>gene'):
            if gene:
                sequences.append(''.join(gene))
                gene = []
        else:
            gene.append(line.strip())
    sequences.append(''.join(gene))  # append last gene
    print(sequences)
output:
['ATGATGATGGCG', 'GGCATATCCGGATACC', 'TAGCTAGCCCGC']
You have multiple mistakes in your code; look here:
with open('sequences1.txt', 'r') as file:
    list = []
    for line in file.read().split('\n'):
        if not line.startswith(">") and len(line) > 0:
            list.append(line)
    print(list)
Try this:
$ cat genes.txt
>gene1
ATGATGATGGCG
>gene2
GGCATATC
CGGATACC
>gene3
TAGCTAGCCCGC
$ python
>>> genes = []
>>> with open('genes.txt') as file_:
...     for line in file_:
...         if not line.startswith('>'):
...             genes.append(line.strip())
...
>>> print(genes)
['ATGATGATGGCG', 'GGCATATC', 'CGGATACC', 'TAGCTAGCCCGC']
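If you also want consecutive sequence lines merged into one string per gene (as in the desired list above), itertools.groupby can do the joining for you. A small sketch, assuming the same genes.txt file shown above:

from itertools import groupby

genes = []
with open('genes.txt') as fh:
    # group consecutive lines by whether they are '>' header lines
    for is_header, group in groupby(fh, key=lambda l: l.startswith('>')):
        if not is_header:
            genes.append(''.join(line.strip() for line in group))

print(genes)  # ['ATGATGATGGCG', 'GGCATATCCGGATACC', 'TAGCTAGCCCGC']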
sequences1.txt:
>gene1
ATGATGATGGCG
>gene2
GGCATATC
CGGATACC
>gene3
TAGCTAGCCCGC
and then:
desired_text = []

with open('sequences1.txt') as input_data:
    content = input_data.readlines()
    content = [l.strip() for l in content if l.strip()]
    for line in content:
        if not line.startswith('>'):
            desired_text.append(line)

print(desired_text)
OUTPUT:
['ATGATGATGGCG', 'GGCATATC', 'CGGATACC', 'TAGCTAGCCCGC']
EDIT:
Speed-read it; fixed it to give the desired output:
with open('sequences1.txt') as input_data:
    content = input_data.readlines()
    # you may also want to remove empty lines
    content = [l.strip() for l in content if l.strip()]

    # flag
    nextLine = False
    # list to save the lines
    textList = []
    concatenated = ''
    for line in content:
        find_TC = line.find('gene')
        if find_TC > 0:
            nextLine = not nextLine
        else:
            if nextLine:
                textList.append(line)
            else:
                if find_TC < 0:
                    if concatenated != '':
                        concatenated = concatenated + line
                        textList.append(concatenated)
                    else:
                        concatenated = line
    print(textList)
OUTPUT:
['ATGATGATGGCG', 'GGCATATCCGGATACC', 'TAGCTAGCCCGC']
I want to know if it's possible to save the output of this code into a dictionary (maybe that's also the wrong data type). I'm not experienced in coding yet, so I can't think of a way it could work.
I want to create a dictionary that holds the lines of the .txt file alongside the number of the corresponding line. In the end, I want to write code where the user has the option to search for a word through an input; the output should return the corresponding line. Does anyone have a suggestion? Thanks in advance! Cheers!
filepath = 'myfile.txt'

with open(filepath) as fp:
    line = fp.readline()
    cnt = 1
    while line:
        print("Line {}: {}".format(cnt, line.strip()))
        line = fp.readline()
        cnt += 1
This should do it (using the code you provided as a framework, it only takes one extra line to store it in a dictionary):
my_dict = {}
filepath = 'myfile.txt'

with open(filepath) as fp:
    line = fp.readline()
    cnt = 1
    while line:
        # print("Line {}: {}".format(cnt, line.strip()))
        my_dict[str(line.strip())] = cnt
        line = fp.readline()
        cnt += 1
Then, you can prompt for user input like this:
usr_in = input('enter text to search: ')
print('That text is found at line(s) {}'.format(
[v for k,v in my_dict.items() if usr_in in k]))
For storing the line's string value as the key in the dictionary and the line number as the value, you can try something like:
filepath = 'myfile.txt'
result_dict = {}

with open(filepath) as fp:
    for line_num, line in enumerate(fp.readlines()):
        result_dict[line.strip()] = line_num + 1
Or, using a dictionary comprehension, the above code becomes:
filepath = 'myfile.txt'
with open(filepath) as fp:
    result_dict = {line.strip(): line_num + 1
                   for line_num, line in enumerate(fp.readlines())}
Now, to search and return all the lines containing a given word:
search_result = [{key: value} for key, value in result_dict.items()
if search_word in key]
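As a rough usage sketch (the prompt text and variable names are placeholders), the search can be wired to user input and printed back as lines:

search_word = input('enter text to search: ')
search_result = [{key: value} for key, value in result_dict.items()
                 if search_word in key]
for match in search_result:
    for line_text, line_num in match.items():
        print('Line {}: {}'.format(line_num, line_text))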
I have many lines like the following:
>ENSG00000003137|ENST00000001146|CYP26B1|72374964|72375167|4732
CGTCGTTAACCGCCGCCATGGCTCCCGCAGAGGCCGAGT
>ENSG00000001630|ENST00000003100|CYP51A1|91763679|91763844|3210
TCCCGGGAGCGCGCTTCTGCGGGATGCTGGGGCGCGAGCGGGACTGTTGACTAAGCTTCG
>ENSG00000003137|ENST00000412253|CYP26B1|72370133;72362405|72370213;72362548|4025
AGCCTTTTTCTTCGACGATTTCCG
In this example, ENSG00000003137 is the name and 4732, the last field, is the length. As you can see, some names are repeated but with different lengths.
I want to make a new file in which I only keep the entry with the longest length for each name, meaning the results would be like this:
>ENSG00000003137|ENST00000001146|CYP26B1|72374964|72375167|4732
CGTCGTTAACCGCCGCCATGGCTCCCGCAGAGGCCGAGT
>ENSG00000001630|ENST00000003100|CYP51A1|91763679|91763844|3210
TCCCGGGAGCGCGCTTCTGCGGGATGCTGGGGCGCGAGCGGGACTGTTGACTAAGCTTCG
I have written this code to split the header lines but don't know how to produce the file I want:
file = open("file.txt", "r")
for line in file:
    if line.startswith(">"):
        line = line.split("|")
You'll need to read the file twice; the first time round, track the largest size per entry:
largest = {}

with open(inputfile) as f:
    for line in f:
        if line.startswith('>'):
            parts = line.split('|')
            name, length = parts[0][1:], int(parts[-1])
            largest[name] = max(length, largest.get(name, -1))
Then write out the copy in a second pass, keeping only those sections whose name and length match the largest length extracted in the first pass:
with open(inputfile) as f, open(outputfile, 'w') as out:
    copying = False
    for line in f:
        if line.startswith('>'):
            parts = line.split('|')
            name, length = parts[0][1:], int(parts[-1])
            copying = largest[name] == length
        if copying:
            out.write(line)
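If you would rather read the file only once, you could instead collect the winning sections in memory as you go. This is a sketch under the assumption that the whole file fits in memory; inputfile and outputfile are the same placeholders as above:

from collections import OrderedDict

# name -> (length, list of lines of the longest entry seen so far)
sections = OrderedDict()
with open(inputfile) as f:
    current = None
    for line in f:
        if line.startswith('>'):
            parts = line.split('|')
            name, length = parts[0][1:], int(parts[-1])
            if name not in sections or length > sections[name][0]:
                current = name
                sections[name] = (length, [line])
            else:
                current = None  # a shorter duplicate: skip its sequence lines
        elif current is not None:
            sections[current][1].append(line)

with open(outputfile, 'w') as out:
    for length, lines in sections.values():
        out.writelines(lines)

The trade-off versus the two-pass version is memory: every kept section is held in RAM until the end, which is fine for typical FASTA-sized files but not for arbitrarily large ones.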
You have to do two types of handling in the loop: one that compares your 'length', and one that stores the CGTA sequence when it's needed. I wrote an example for you that reads those into dicts:
file = open("file.txt", "r")
myDict = {}
myValueDict = {}
action = 'remember'
geneDict = {}
for line in file:
if line.startswith(">"):
line = line.rstrip().split("|")
line_name = line[0]
line_number = int(line[-1])
if line_name in myValueDict:
if myValueDict[line_name] < line_number:
action = 'remember'
myValueDict[line_name] = line_number
myDict[line_name] = line
else:
action = 'forget'
else:
myDict[line_name] = line
myValueDict[line_name] = line_number
else:
if action == 'remember':
geneDict[line_name] = line.rstrip()
for key in myDict:
print(myDict[key])
for key in geneDict:
print(geneDict[key])
This ignores the shorter-length items. You can now store those dicts any way you want.
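Since the question asks for a new file rather than printed output, the two dicts could be written back out at the end. A small sketch, where the output file name and the re-joining of the header with '|' are my assumptions:

with open("longest.txt", "w") as fout:
    for key in myDict:
        fout.write("|".join(myDict[key]) + "\n")  # header line, re-joined with '|'
        if key in geneDict:
            fout.write(geneDict[key] + "\n")      # its sequence line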
I want to go through each line of a CSV file and compare to see whether the first field of line 1 is the same as the first field of the next line, and so on. If it finds a match, then I would like to ignore those two lines that contain the same field and keep the lines where there is no match.
Here is an example dataset (no_dup.txt)
Ac_Gene_ID M_Gene_ID
ENSGMOG00000015632 ENSORLG00000010573
ENSGMOG00000015632 ENSORLG00000010585
ENSGMOG00000003747 ENSORLG00000006947
ENSGMOG00000003748 ENSORLG00000004636
Basically, I want to exclude lines 1 and 2 since they contain the same field (ENSGMOG00000015632) and keep lines 3 and 4.
Here is the code I have tried but couldn't finish:
prev = None
with open("no_dup.txt", 'r') as fh_in:
    for line in fh_in:
        line = line.strip()
        if line.startswith("E"):
            line1 = line.split()
            print "initial gene =", line1[0]
            if prev is not None or prev != line1[0]:
                prev = line1[0]
I think a clean way of doing this would be to make a map of each entry -> list of lines.
entries = {}
with open('no_dup.txt', 'r') as fh_in:
    for line in fh_in:
        entry = line.split()[0]
        if entry in entries:
            entries[entry].append(line)
        else:
            entries[entry] = [line]

for entry, matches in entries.iteritems():
    if len(matches) == 1:
        print matches[0]
You should note that this will NOT preserve the order of entries.
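If the order of entries matters, a variant of the same idea using collections.OrderedDict (still Python 2, to match the iteritems above) could look like this; skipping the header row with startswith('E') mirrors the check in your original script:

from collections import OrderedDict

entries = OrderedDict()
with open('no_dup.txt', 'r') as fh_in:
    for line in fh_in:
        if not line.startswith('E'):
            continue  # skip the header row, as in the original script
        entries.setdefault(line.split()[0], []).append(line)

with open('no_dup_out.txt', 'w') as fh_out:
    for entry, matches in entries.iteritems():
        if len(matches) == 1:
            fh_out.write(matches[0])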
Your start looks good:
def filter_dups(iterable):
    prev = None
    for line in iterable:
        if line.startswith("E"):
            if prev is not None and prev.split(None, 1)[0] == line.split(None, 1)[0]:
                # same first field as the previous line: drop both
                prev = None
            else:
                if prev is not None:
                    yield prev
                prev = line
        else:
            yield line
            prev = None
    if prev is not None:
        yield prev

with open("no_dup.txt", 'r') as fh_in:
    with open("no_dup_out.txt", 'w') as fh_out:
        fh_out.writelines(filter_dups(fh_in))
You can use this:
with open('a.txt', 'r') as inputFile:
    lines = inputFile.readlines()

prev = lines[0]
for i in range(1, len(lines)):
    cur = lines[i]
    if prev.split()[0] != cur.split()[0]:
        print prev.strip()
    prev = cur
print lines[-1].strip()
Input:
ENSGMOG00000015632 ENSORLG00000010573
ENSGMOG00000015632 ENSORLG00000010585
ENSGMOG00000003747 ENSORLG00000006947
ENSGMOG00000003748 ENSORLG00000004636
Output:
ENSGMOG00000015632 ENSORLG00000010585
ENSGMOG00000003747 ENSORLG00000006947
ENSGMOG00000003748 ENSORLG00000004636