How to get value from table text file by Python? - python

i'm trying get value from table text file.
filename = open('D:\THESIS\DATA\outputData\covarince.txt', 'r')
for n in xrange(21):
next(filename)
#for line in filename:
for line in filename:
line = line.strip()
word = line.split()
print word [5:9]
but result ....
['1.858598e+007', '1.771380e+007', '1.680333e+007', '3.094793e+007']
['1.755510e+007', '1.675123e+007', '1.592444e+007', '2.924262e+007']
['1.667081e+007', '1.593449e+007', '1.525907e+007', '2.744351e+007']
['3.140037e+007', '2.997102e+007', '2.821130e+007', '5.446339e+007']
['2.610668e+007', '2.504934e+007', '2.423942e+007', '4.068118e+007']
['2.504934e+007', '2.410118e+007', '2.337335e+007', '3.907932e+007']
['2.423942e+007', '2.337335e+007', '2.292371e+007', '3.696649e+007']
['4.068118e+007', '3.907932e+007', '3.696649e+007', '7.047854e+007']
[]
[]
[]
[]
[]
['4', '5', '6', '7']
[]
['0.83751', '0.83075', '0.80804', '0.84876']
['0.83533', '0.82958', '0.80864', '0.84687']
['0.82648', '0.82219', '0.80731', '0.82806']
['0.81225', '0.80689', '0.77877', '0.85745']
['1.00000', '0.99862', '0.99084', '0.94839']
['0.99862', '1.00000', '0.99440', '0.94820']
['0.99084', '0.99440', '1.00000', '0.91968']
['0.94839', '0.94820', '0.91968', '1.00000']
[]
i want only value in yellow highlight:

This should get you started
filepath = 'D:\THESIS\DATA\outputData\covarince.txt'
field_offset = 5 # first element index to get
nb_lines = 3 # number of lines we want
line_index = 0 # current line
with open(filepath, 'r') as f:
for line in f:
line = line.strip()
if line == '' or line.startswith('#'):
# ignore comments and empty lines
continue
if line_index >= nb_lines:
# stop when we got all lines we want
break
# list of elements in line
elements = line.split()
print elements[line_index + field_offset]
line_index += 1

Related

Reading a specific portion of data from a .txt file in Python

>gene1
ATGATGATGGCG
>gene2
GGCATATC
CGGATACC
>gene3
TAGCTAGCCCGC
This is the text file which I am trying to read.
I want to read every gene in a different string and then add it in a list
There are header lines starting with ’>’ character to recognize if this is a start or end of a gene
with open('sequences1.txt') as input_data:
for line in input_data:
while line != ">":
list.append(line)
print(list)
When printed the list should display list should be
list =["ATGATGATGGCG","GGCATATCCGGATACC","TAGCTAGCCCGC"]
with open('sequences1.txt') as input_data:
sequences = []
gene = []
for line in input_data:
if line.startswith('>gene'):
if gene:
sequences.append(''.join(gene))
gene = []
else:
gene.append(line.strip())
sequences.append(''.join(gene)) # append last gene
print(sequences)
output:
['ATGATGATGGCG', 'GGCATATCCGGATACC', 'TAGCTAGCCCGC']
You have multiple mistakes in your code, look here:
with open('sequences1.txt', 'r') as file:
list = []
for line in file.read().split('\n'):
if not line.startswith(">") and len(line$
list.append(line)
print(list)
Try this:
$ cat genes.txt
>gene1
ATGATGATGGCG
>gene2
GGCATATC
CGGATACC
>gene3
TAGCTAGCCCGC
$ python
>>> genes = []
>>> with open('genes.txt') as file_:
... for line in f:
... if not line.startswith('>'):
... genes.append(line.strip())
...
>>> print(genes)
['ATGATGATGGCG', 'GGCATATC', 'CGGATACC', 'TAGCTAGCCCGC']
sequences1.txt:
>gene1
ATGATGATGGCG
>gene2
GGCATATC
CGGATACC
>gene3
TAGCTAGCCCGC
and then:
desired_text = []
with open('sequences1.txt') as input_data:
content = input_data.readlines()
content = [l.strip() for l in content if l.strip()]
for line in content:
if not line.startswith('>'):
desired_text.append(line)
print(desired_text)
OUTPUT:
['ATGATGATGGCG', 'GGCATATC', 'CGGATACC', 'TAGCTAGCCCGC']
EDIT:
Sped-read it, fixed it with the desired output
with open('sequences1.txt') as input_data:
content = input_data.readlines()
# you may also want to remove empty lines
content = [l.strip() for l in content if l.strip()]
# flag
nextLine = False
# list to save the lines
textList = []
concatenated = ''
for line in content:
find_TC = line.find('gene')
if find_TC > 0:
nextLine = not nextLine
else:
if nextLine:
textList.append(line)
else:
if find_TC < 0:
if concatenated != '':
concatenated = concatenated + line
textList.append(concatenated)
else:
concatenated = line
print(textList)
OUTPUT:
['ATGATGATGGCG', 'GGCATATCCGGATACC', 'TAGCTAGCCCGC']

In python2.7.11, why can't I remove the fileopen code?

The .txt file holding the data is as follows (source: "datingTestSet2.txt" in Ch.2 here):
40920 8.326976 0.953952 largeDoses
14488 7.153469 1.673904 smallDoses
26052 1.441871 0.805124 didntLike
75136 13.147394 0.428964 didntLike
38344 1.669788 0.134296 didntLike
...
Code:
from numpy import *
import operator
from os import listdir
def file2matrix(filename):
fr = open(filename)
# arr = fr.readlines() # Code1!!!!!!!!!!!!!!!!!!!
numberOfLines = len(fr.readlines()) #get the number of lines in the file
returnMat = zeros((numberOfLines,3)) #prepare matrix to return
classLabelVector = [] #prepare labels return
fr = open(filename) # Code2!!!!!!!!!!!!!!!!!!!!!
index = 0
for line in fr.readlines():
line = line.strip()
listFromLine = line.split('\t')
returnMat[index,:] = listFromLine[0:3]
classLabelVector.append(int(listFromLine[-1]))
index += 1
return returnMat,classLabelVector
datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
The result of this function is:
datingDataMat datingLabels
40920 8.326976 0.953952 3
14488 7.153469 1.673904 2
26052 1.441871 0.805124 1
75136 13.147394 0.428964 1
38344 1.669788 0.134296 1
72993 10.141740 1.032955 1
35948 6.830792 1.213192 3
42666 13.276369 0.543880 3
67497 8.631577 0.749278 1
35483 12.273169 1.508053 3
50242 3.723498 0.831917 1
... ... ... ...
My questions are:
When I just remove the Code2(fr = open(filename) which above the index = 0),
the result of the function becomes all zeros matrix, and all zeros vector.
Why can't I remove the Code2? Doesn't the first line(fr = open(filename) work?
When I just add the Code1(arr = fr.readlines()), it is wrong. Why???
returnMat[index,:] = listFromLine[0:3]
IndexError: index 0 is out of bounds for axis 0 with size 0
1) You can't remove the Code2 line because of this line:
numberOfLines = len(fr.readlines()) #get the number of lines in the file
In that line you are reading to the end of the file. Opening it again puts you at the start of the file...
2) Similar to the answer above, if you do a call to readLines() that reads all the lines and moves the file cursor to the end of the file... So if you then try to readlines on the file again, there is nothing to read, hence it fails.
You are at the end of the file. Therefore, your second attempt to read the file content yields nothings. You need to go back to beginning of the file. Use:
fr.seek(0)
Instead of your:
fr = open(filename) # Code2!!!!!!!!!!!!!!!!!!!!!
You only need to readlines once.
def file2matrix(filename):
fr = open(filename)
lines = fr.readlines()
fr.close()
numberOfLines = len(lines) #get the number of lines in the file
returnMat = zeros((numberOfLines,3)) #prepare matrix to return
classLabelVector = [] #prepare labels return
index = 0
for line in lines:
line = line.strip()
listFromLine = line.split('\t')
returnMat[index,:] = listFromLine[0:3]
# careful here, returnMat is initialed as floats
# listFromLine is list of strings
classLabelVector.append(int(listFromLine[-1]))
index += 1
return returnMat,classLabelVector
I can suggest a few other changes:
def file2matrix(filename):
with open(filename) as f:
lines = f.readlines()
returnList = []
classLabelList = []
for line in lines:
listFromLine = line.strip().split('\t')
returnList.append(listFromLine[0:3])
classLabelList.append(int(listFromLine[-1]))
returnMat = np.array(returnList, dtype=float)
return returnMat, classLabelList
or even
def file2matrix(filename):
with open(filename) as f:
lines = f.readlines()
ll = [line.strip().split('\t')]
returnMat = np.array([l[0:3] for l in ll], dtype=float)
classLabelList = [int(l[-1]) for l in ll]
# classLabelVec = np.array([l[-1] for l in ll], dtype=int)
return returnMat, classLabelList

How can i compare two columns in two different rows in python

I want to go through each line of the a csv file and compare to see if the first field of line 1 is same as first field of next line and so on. If it finds a match then i would like ignore those two lines that contains the same fields and keep the lines where there is no match
Here is an example dataset (no_dup.txt)
Ac_Gene_ID M_Gene_ID
ENSGMOG00000015632 ENSORLG00000010573
ENSGMOG00000015632 ENSORLG00000010585
ENSGMOG00000003747 ENSORLG00000006947
ENSGMOG00000003748 ENSORLG00000004636
Basically i want to exclude line 1 and 2 since they contains the same fields (ENSGMOG00000015632) and keep lines 3 and 4
Here is the code i have tried but couldn't finish it
prev = None
with open("no_dup.txt", 'r') as fh_in:
for line in fh_in:
line = line.strip()
if line.startswith("E"):
line1 = line.split()
print "initial gene =", line1[0]
if prev is not None or prev!= line1[0]:
prev = line1[0]
I think a clean way of doing this would be to make a map of each entry -> list of lines.
entries = {}
with open('no_dup.txt', 'r') as fh_in:
for line in fg_in:
entry = line.split()[0]
if entry in entries:
entries[entry].append(line)
else:
entries[entry] = [line]
for matches in entries.iteritems():
if len(matches) == 1:
print matches[0]
You should note that this will NOT preserve the order of entries.
Your start looks good:
def filter_dups(iterable):
prev = None
for line in iterable:
if line.startswith("E"):
if prev.split(None, 1)[0] == line.split(None, 1)[0]:
prev = None
else:
if prev is not None:
yield prev
else:
prev = line
else:
yield line
prev = None
if prev is not None:
yield prev
with open("no_dup.txt", 'r') as fh_in:
with open("no_dup_out.txt", 'r') as fh_out:
fh_out.writelines(filter_dups(fh_in))
You can use this:
with open('a.txt','r') as inputFile:
lines = inputFile.readlines()
prev = lines[0]
for i in range(1, len(lines)):
cur = lines[i]
if prev.split()[0] != cur.split()[0]:
print prev.strip()
prev = cur
print lines[-1].strip()
Input:
ENSGMOG00000015632 ENSORLG00000010573
ENSGMOG00000015632 ENSORLG00000010585
ENSGMOG00000003747 ENSORLG00000006947
ENSGMOG00000003748 ENSORLG00000004636
Output:
ENSGMOG00000015632 ENSORLG00000010585
ENSGMOG00000003747 ENSORLG00000006947
ENSGMOG00000003748 ENSORLG00000004636

How to format the output of dictionary in python

I would like to format the values of a dictionary in python. Here is the script that i have used to generate the output
entries = {}
entries1 = {}
with open('no_dup.txt', 'r') as fh_in:
for line in fh_in:
if line.startswith('E'):
line = line.strip()
line = line.split()
entry = line[0]
if entry in entries:
entries[entry].append(line)
else:
entries[entry] = [line]
with open('no_dup_out.txt', 'w') as fh_out:
for kee, val in entries.iteritems():
if len(val) == 1:
fh_out.write("{} \n".format(val))
with open('no_dup_out.txt', 'r') as fh_in2:
for line in fh_in2:
line = line.strip()
line = line.split()
entry = line[1]
if entry in entries1:
entries1[entry].append(line)
else:
entries1[entry] = [line]
with open('no_dup_out_final.txt', 'w') as fh_out2:
for kee, val in entries1.iteritems():
if len(val) == 1:
fh_out2.write("{} \n".format(val))
For example by running the above script i generated the following output
[["[['ENSGMOG00000003747',", "'ENSORLG00000006947']]"]]
[["[['ENSGMOG00000003752',", "'ENSORLG00000005385']]"]]
[["[['ENSGMOG00000003760',", "'ENSORLG00000005379']]"]]
[["[['ENSGMOG00000003748',", "'ENSORLG00000004636']]"]]
[["[['ENSGMOG00000003761',", "'ENSORLG00000005382']]"]]
And i would like to format it such as way that i remove all the parentheses and commas (ENSGMOG00000003747 ENSORLG00000006947) and output the rest as it is using tab delimited format. How can i do that?
If your list of lists is full_list, then you could have the following code give your desired output:
desired_list = ['\t'.join([element.split('\'')[1] for element in list_item[0]]) for list_item in full_list]

Rewind the file pointer to the beginning of the previous line

I am doing text processing and using 'readline()' function as follows:
ifd = open(...)
for line in ifd:
while (condition)
do something...
line = ifd.readline()
condition = ....
#Here when the condition becomes false I need to rewind the pointer so that the 'for' loop read the same line again.
ifd.fseek() followed by readline is giving me a '\n' character. How to rewind the pointer so that the whole line is read again.
>>> ifd.seek(-1,1)
>>> line = ifd.readline()
>>> line
'\n'
Here is my code
labtestnames = sorted(tmp)
#Now read each line in the inFile and write into outFile
ifd = open(inFile, "r")
ofd = open(outFile, "w")
#read the header
header = ifd.readline() #Do nothing with this line. Skip
#Write header into the output file
nl = "mrn\tspecimen_id\tlab_number\tlogin_dt\tfluid"
offset = len(nl.split("\t"))
nl = nl + "\t" + "\t".join(labtestnames)
ofd.write(nl+"\n")
lenFields = len(nl.split("\t"))
print "Reading the input file and converting into modified file for further processing (correlation analysis etc..)"
prevTup = (0,0,0)
rowComplete = 0
k=0
for line in ifd:
k=k+1
if (k==200): break
items = line.rstrip("\n").split("\t")
if((items[0] =='')):
continue
newline= list('' for i in range(lenFields))
newline[0],newline[1],newline[3],newline[2],newline[4] = items[0], items[1], items[3], items[2], items[4]
ltests = []
ltvals = []
while(cmp(prevTup, (items[0], items[1], items[3])) == 0): # If the same mrn, lab_number and specimen_id then fill the same row. else create a new row.
ltests.append(items[6])
ltvals.append(items[7])
pos = ifd.tell()
line = ifd.readline()
prevTup = (items[0], items[1], items[3])
items = line.rstrip("\n").split("\t")
rowComplete = 1
if (rowComplete == 1): #If the row is completed, prepare newline and write into outfile
indices = [labtestnames.index(x) for x in ltests]
j=0
ifd.seek(pos)
for i in indices:
newline[i+offset] = ltvals[j]
j=j+1
if (rowComplete == 0): #
currTup = (items[0], items[1], items[3])
ltests = items[6]
ltvals = items[7]
pos = ifd.tell()
line = ifd.readline()
items = line.rstrip("\n").split("\t")
newTup = (items[0], items[1], items[3])
if(cmp(currTup, newTup) == 0):
prevTup = currTup
ifd.seek(pos)
continue
else:
indices = labtestnames.index(ltests)
newline[indices+offset] = ltvals
ofd.write(newline+"\n")
The problem can be handled more simply using itertools.groupby. groupby can cluster all the contiguous lines that deal with the same mrn, specimen_id, and lab_num.
The code that does this is
for key, group in IT.groupby(reader, key = mykey):
where reader iterates over the lines of the input file, and mykey is defined by
def mykey(row):
return (row['mrn'], row['specimen_id'], row['lab_num'])
Each row from reader is passed to mykey, and all rows with the same key are clustered together in the same group.
While we're at it, we might as well use the csv module to read each line into a dict (which I call row). This frees us from having to deal with low-level string manipulation like line.rstrip("\n").split("\t") and instead of referring to columns by index numbers (e.g. row[3]) we can write code that speaks in higher-level terms such as row['lab_num'].
import itertools as IT
import csv
inFile = 'curious.dat'
outFile = 'curious.out'
def mykey(row):
return (row['mrn'], row['specimen_id'], row['lab_num'])
fieldnames = 'mrn specimen_id date lab_num Bilirubin Lipase Calcium Magnesium Phosphate'.split()
with open(inFile, 'rb') as ifd:
reader = csv.DictReader(ifd, delimiter = '\t')
with open(outFile, 'wb') as ofd:
writer = csv.DictWriter(
ofd, fieldnames, delimiter = '\t', lineterminator = '\n', )
writer.writeheader()
for key, group in IT.groupby(reader, key = mykey):
new = {}
row = next(group)
for key in ('mrn', 'specimen_id', 'date', 'lab_num'):
new[key] = row[key]
new[row['labtest']] = row['result_val']
for row in group:
new[row['labtest']] = row['result_val']
writer.writerow(new)
yields
mrn specimen_id date lab_num Bilirubin Lipase Calcium Magnesium Phosphate
4419529 1614487 26.2675 5802791G 0.1
3319529 1614487 26.2675 5802791G 0.3 153 8.1 2.1 4
5713871 682571 56.0779 9732266E 4.1
This seems to be a perfect use case for yield expressions. Consider the following example that prints lines from a file, repeating some of them at random:
def buflines(fp):
r = None
while True:
r = yield r or next(fp)
if r:
yield None
from random import randint
with open('filename') as fp:
buf = buflines(fp)
for line in buf:
print line
if randint(1, 100) > 80:
print 'ONCE AGAIN::'
buf.send(line)
Basically, if you want to process an item once again, you send it back to the generator. On the next iteration you will be reading the same item once again.

Categories