The .txt file holding the data is as follows (source: "datingTestSet2.txt" in Ch.2 here):
40920 8.326976 0.953952 largeDoses
14488 7.153469 1.673904 smallDoses
26052 1.441871 0.805124 didntLike
75136 13.147394 0.428964 didntLike
38344 1.669788 0.134296 didntLike
...
Code:
from numpy import *
import operator
from os import listdir
def file2matrix(filename):
    """Parse a tab-separated text file into a feature matrix and a label list.

    Each line contributes its first three fields as one row of the returned
    float matrix and its last field, converted with int(), as one label.

    Returns:
        (returnMat, classLabelVector): a (numberOfLines x 3) array created
        with zeros() and a list of ints.
    """
    fr = open(filename)
    # arr = fr.readlines() # Code1!!!!!!!!!!!!!!!!!!!
    # readlines() consumes the whole file, leaving fr positioned at end-of-file.
    numberOfLines = len(fr.readlines()) #get the number of lines in the file
    returnMat = zeros((numberOfLines,3)) #prepare matrix to return
    classLabelVector = [] #prepare labels return
    # A second open() gives a fresh handle positioned at the start of the file;
    # the first handle is never explicitly closed.
    fr = open(filename) # Code2!!!!!!!!!!!!!!!!!!!!!
    index = 0
    for line in fr.readlines():
        line = line.strip()
        listFromLine = line.split('\t')
        # The three string fields are converted to floats when stored in the row.
        returnMat[index,:] = listFromLine[0:3]
        classLabelVector.append(int(listFromLine[-1]))
        index += 1
    return returnMat,classLabelVector
datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
The result of this function is:
datingDataMat datingLabels
40920 8.326976 0.953952 3
14488 7.153469 1.673904 2
26052 1.441871 0.805124 1
75136 13.147394 0.428964 1
38344 1.669788 0.134296 1
72993 10.141740 1.032955 1
35948 6.830792 1.213192 3
42666 13.276369 0.543880 3
67497 8.631577 0.749278 1
35483 12.273169 1.508053 3
50242 3.723498 0.831917 1
... ... ... ...
My questions are:
When I just remove the Code2(fr = open(filename) which above the index = 0),
the result of the function becomes all zeros matrix, and all zeros vector.
Why can't I remove Code2? Doesn't the first line (fr = open(filename)) still work?
When I just add the Code1(arr = fr.readlines()), it is wrong. Why???
returnMat[index,:] = listFromLine[0:3]
IndexError: index 0 is out of bounds for axis 0 with size 0
1) You can't remove the Code2 line because of this line:
numberOfLines = len(fr.readlines()) #get the number of lines in the file
In that line you are reading to the end of the file. Opening it again puts you at the start of the file...
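2) Similar to the answer above: a call to readlines() reads all the lines and moves the file cursor to the end of the file. If you then call readlines() on the same file object again, there is nothing left to read, so it returns an empty list. With Code1 added, numberOfLines is therefore 0, the matrix is created with zero rows, and the first row assignment fails with the IndexError.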
You are at the end of the file. Therefore, your second attempt to read the file content yields nothing. You need to go back to the beginning of the file. Use:
fr.seek(0)
Instead of your:
fr = open(filename) # Code2!!!!!!!!!!!!!!!!!!!!!
You only need to readlines once.
def file2matrix(filename):
    """Read a tab-separated file once and split it into features and labels.

    The file is read with a single readlines() call and closed straight away.
    Every line supplies three feature fields (stored as floats) and a final
    field that is parsed as an integer class label.

    Returns:
        (returnMat, classLabelVector): an array with one row per line and
        three columns, and a list of int labels in file order.
    """
    fr = open(filename)
    lines = fr.readlines()
    fr.close()

    # One row per line, three feature columns, initialised to 0.0.
    returnMat = zeros((len(lines), 3))
    classLabelVector = []
    for row, raw in enumerate(lines):
        fields = raw.strip().split('\t')
        # fields holds strings; they are converted to floats on assignment
        # because returnMat is a float array.
        returnMat[row, :] = fields[0:3]
        classLabelVector.append(int(fields[-1]))
    return returnMat, classLabelVector
I can suggest a few other changes:
def file2matrix(filename):
    """Load a tab-separated file into a float matrix and a list of int labels.

    The first three fields of each line become one matrix row; the last
    field is parsed with int() and collected as that row's label.

    Returns:
        (returnMat, classLabelList): a float array built from the collected
        rows, and the labels in file order.
    """
    # The context manager closes the file even if reading fails.
    with open(filename) as f:
        raw_lines = f.readlines()

    feature_rows = []
    labels = []
    for raw in raw_lines:
        fields = raw.strip().split('\t')
        feature_rows.append(fields[0:3])
        labels.append(int(fields[-1]))

    # The string fields are converted to floats in one step here.
    return np.array(feature_rows, dtype=float), labels
or even
def file2matrix(filename):
    """Load a tab-separated file into a float matrix and a list of int labels.

    The first three fields of each line become one matrix row; the last
    field is parsed with int() and collected as that row's label.

    Returns:
        (returnMat, classLabelList): a float array built from the first
        three fields of every line, and the int labels in file order.

    Raises:
        ValueError: if a feature field is not a valid float or a label is
            not a valid int.
    """
    with open(filename) as f:
        lines = f.readlines()
    # Split EVERY line into its fields. The previous version wrote
    # [line.strip().split('\t')] without iterating over `lines`, which
    # raised NameError because `line` was never defined.
    ll = [line.strip().split('\t') for line in lines]
    returnMat = np.array([fields[0:3] for fields in ll], dtype=float)
    classLabelList = [int(fields[-1]) for fields in ll]
    # Alternative: return the labels as an int array instead of a list:
    # classLabelVec = np.array([fields[-1] for fields in ll], dtype=int)
    return returnMat, classLabelList
Related
This function doesn't convert the file into a matrix properly; when I run the code I get this error message:
returnMat[index,:] = listFromLine[0:3]
ValueError: could not convert string to float.
def fileToMatrix(filename):
    """Parse a tab-separated file into a (numberOfLines x 3) matrix and labels.

    Intended to store the first three fields of each line as a matrix row
    and the last field, converted with int(), as the label.
    """
    fr = open(filename)
    # readlines() consumes the whole file; only the line count is kept.
    numberOfLines = len(fr.readlines())
    returnMat = np.zeros((numberOfLines,3))
    classLabelVector = []
    # Re-open to start reading from the top again (the first handle is not closed).
    fr = open(filename)
    index = 0
    # NOTE(review): readline() returns ONE line as a string, so this loop
    # iterates over the characters of the first line, not over the lines
    # of the file.
    for line in fr.readline():
        line = line.strip() # remove leading/trailing whitespace, including the newline
        listFromLine = line.split('\t') # split on tab and make list
        returnMat[index,:] = listFromLine[0:3]
        classLabelVector.append(int(listFromLine[-1]))
        index += 1
    return returnMat, classLabelVector
if __name__ == '__main__':
    # Run only when executed as a script: parse the sample file and show the matrix.
    filename ="mydata.txt"
    returnMat,classLabelVector=fileToMatrix(filename)
    print(returnMat)
and data file looks like:
enter image description here
haha... error is in the line:
for line in fr.readline():
I should have used readlines() function to read all the line of the .txt file
I have a python script that is checking data from a text file and writing it if it meets the right parameters. So far I have:
# -*- coding: utf-8 -*-
import math
# Read COLLISON.txt once and write three filtered copies of its lines.
# NOTE(review): raw_input() below implies Python 2, where input() evaluates
# the typed text (so rad is presumably a number) -- confirm.
f = open("COLLISON.txt", 'r')
linelist = f.readlines()
# NOTE(review): missing call parentheses -- this only references the method,
# so the input file is never explicitly closed here.
f.close
# Pass 1: keep every line whose first character is 'Û'.
f2 = open("All_Collisions_Raw_Data.txt", "w")
for line in linelist:
    if 'Û' in line[0]:
        f2.write(line)
f2.close()
# Pass 2: keep every line containing the text 'Prime Recoil'.
f3 = open("Primary_Recoils_Raw_Data.txt", "w")
for line in linelist:
    if 'Prime Recoil' in line:
        f3.write(line)
f3.close()
S = raw_input('Are you analysing a sphere?\n Y/n \n')
if S == 'Y' or S == 'y':
    rad = input('What is the radius of the sphere in Angstroms? \n')
    f14 = open('All_Collisions_in_sphere', 'w')
    # Pass 3: keep 'Û' lines whose computed distance r is smaller than rad.
    for line in linelist:
        # The slices below read up to index 55, hence the length guard.
        if len(line) >55:
            if 'Û' in line[0]:
                # Fixed-column fields. Each coordinate is read as two numbers
                # (presumably mantissa and base-10 exponent -- confirm against
                # the file format) and recombined as value * 10**exponent.
                Xa = float(''.join(line[25:29]))
                # Only the X value is shifted by the radius before scaling.
                Xs = float((Xa - rad))
                Ya = float(''.join(line[36:40]))
                Za = float(''.join(line[47:51]))
                Xf = float(''.join(line[31:34]))
                Yf = float(''.join(line[42:45]))
                Zf = float(''.join(line[53:56]))
                Xf1 = float(10**Xf)
                Yf1 = float(10**Yf)
                Zf1 = float(10**Zf)
                Xd = float((Xs*Xf1))
                Yd = float((Ya*Yf1))
                Zd = float((Za*Zf1))
                # r = sqrt(Xd^2 + Yd^2 + Zd^2)
                Xb = float((Xd*Xd))
                Yb = float((Yd*Yd))
                Zb = float((Zd*Zd))
                ra = float(Xb + Yb + Zb)
                r = float(math.sqrt(ra))
                # Single character at column 6; read but not used in the code shown.
                I = (line[6])
                if r < rad:
                    f14.write(line)
    f14.close()
I only want to write the line if I is 1 or is equal to the previous line's I + 1. However, I'm unsure how to refer to the previous line, or how to keep the current line for future recall. Does anyone know how I can achieve this?
One way is to just store the previous (we initialise to None and check if it is None):
prev = None
for line in file:
if prev is not None:
if line == prev:
# do stuff
prev = line
Another way is to use an iterator:
itr = iter(file)
prev = next(itr)
for line in itr:
if line == prev:
# do stuff
prev = line
Edit
If you want to get each line number as well, use the enumerate function:
for line_number, line in enumerate(file, start=1):
...
Just as an FYI don't do
file = open(path)
linelist = file.readlines()
file.close()
for line in linelist:
...
but instead do this:
with open(path) as file:
for line in file:
...
The reason is that the first method reads the entire file into memory and will not close the file if an exception happens during the read, which leaks the open file handle. The with statement handles all of that for you, and you can then iterate over the file directly, one line at a time.
i'm trying get value from table text file.
# Print fields 5..8 of every line after the first 21 lines of the file.
# Despite its name, `filename` holds the open file object, not a path.
# None of \T, \D, \o, \c is a recognised escape sequence, so the backslashes
# in the path are kept literally.
filename = open('D:\THESIS\DATA\outputData\covarince.txt', 'r')
# Skip the first 21 lines by advancing the file iterator.
for n in xrange(21):
    next(filename)
#for line in filename:
for line in filename:
    line = line.strip()
    word = line.split()
    # Lines with fewer than six fields (including blank lines) print [].
    print word [5:9]
but result ....
['1.858598e+007', '1.771380e+007', '1.680333e+007', '3.094793e+007']
['1.755510e+007', '1.675123e+007', '1.592444e+007', '2.924262e+007']
['1.667081e+007', '1.593449e+007', '1.525907e+007', '2.744351e+007']
['3.140037e+007', '2.997102e+007', '2.821130e+007', '5.446339e+007']
['2.610668e+007', '2.504934e+007', '2.423942e+007', '4.068118e+007']
['2.504934e+007', '2.410118e+007', '2.337335e+007', '3.907932e+007']
['2.423942e+007', '2.337335e+007', '2.292371e+007', '3.696649e+007']
['4.068118e+007', '3.907932e+007', '3.696649e+007', '7.047854e+007']
[]
[]
[]
[]
[]
['4', '5', '6', '7']
[]
['0.83751', '0.83075', '0.80804', '0.84876']
['0.83533', '0.82958', '0.80864', '0.84687']
['0.82648', '0.82219', '0.80731', '0.82806']
['0.81225', '0.80689', '0.77877', '0.85745']
['1.00000', '0.99862', '0.99084', '0.94839']
['0.99862', '1.00000', '0.99440', '0.94820']
['0.99084', '0.99440', '1.00000', '0.91968']
['0.94839', '0.94820', '0.91968', '1.00000']
[]
i want only value in yellow highlight:
This should get you started
# Print one field from each of the first nb_lines non-empty, non-comment
# lines: index field_offset on the first such line, field_offset + 1 on the
# second, and so on (the selected column advances with the line counter).
filepath = 'D:\THESIS\DATA\outputData\covarince.txt'
field_offset = 5 # first element index to get
nb_lines = 3 # number of lines we want
line_index = 0 # current line
with open(filepath, 'r') as f:
    for line in f:
        line = line.strip()
        if line == '' or line.startswith('#'):
            # ignore comments and empty lines
            continue
        if line_index >= nb_lines:
            # stop when we got all lines we want
            break
        # list of elements in line
        elements = line.split()
        print elements[line_index + field_offset]
        # Only lines that were actually printed advance the counter.
        line_index += 1
The code below have two same lines, but I think fr is already opened by the first line. I try to remove the second lines, but the code failed. So why we need to the open file everytime when we use it?
def file2matrix(filename):
    """Parse a tab-separated file into a (numberOfLines x 3) matrix and labels.

    The first three fields of each line are stored as one matrix row; the
    last field is kept as a string label.
    """
    fr = open(filename) #<-------------------------
    # readlines() reads to end-of-file, so this handle has nothing left to give.
    numberOfLines = len(fr.readlines())
    returnMat = np.zeros((numberOfLines,3))
    classLabelVector = []
    # A fresh handle starts at the top of the file again; the first one is
    # never explicitly closed.
    fr = open(filename) # <------------------------
    index = 0
    for line in fr.readlines():
        line = line.strip()
        listFromLine = line.split('\t')
        # The string fields are converted to floats when stored in the row.
        returnMat[index,:] = listFromLine[0:3]
        classLabelVector.append(listFromLine[-1])
        index += 1
    return returnMat, classLabelVector
You don't need to reopen the file, but you do need to go back to the beginning.
The readline() function reads a line in a file. Each time you call readline(), the pointer will move to the next line.
readlines() calls readline() until it gets to the end of the file. If you want to move back to the beginning, you need to either call fr.seek(0) or reopen the file. (If you do reopen it, it's better practice to close the first handle before opening it again. Even if you're only reading once, you should close the file at the end.)
If you only want to go through the file once, you can count the number of lines as you move through the file, and then return that number.
Original:
def file2matrix(filename):
    """Parse a tab-separated file into a (numberOfLines x 3) matrix and labels.

    Two passes over the file: the first only counts lines so the matrix can
    be pre-allocated, the second fills it in.
    """
    fr = open(filename)
    # Pass 1: count lines. readlines() leaves the handle at end-of-file.
    numberOfLines = len(fr.readlines())
    returnMat = np.zeros((numberOfLines,3))
    classLabelVector = []
    # Pass 2 needs a handle positioned at the start, hence the second open().
    fr = open(filename)
    index = 0
    for line in fr.readlines():
        line = line.strip()
        listFromLine = line.split('\t')
        # The string fields are converted to floats when stored in the row.
        returnMat[index,:] = listFromLine[0:3]
        # Labels are kept as strings here (no int() conversion).
        classLabelVector.append(listFromLine[-1])
        index += 1
    return returnMat, classLabelVector
Another way:
def file2matrix(filename):
    """Parse a tab-separated file in a single pass.

    The file is read once. Rows are collected while iterating, so the
    number of lines is known by the time the matrix is allocated and the
    file never has to be rewound or reopened.

    Fixes over the previous single-pass version:
      * the loop now ends at end-of-file (readline() returns '' there, never
        None, so ``while line is not None`` looped forever);
      * the matrix is created before it is filled, and is no longer
        overwritten with zeros just before returning;
      * labels are stripped of the trailing newline.

    Args:
        filename: path of the text file; each non-blank line holds at least
            three numeric fields followed by a label, separated by tabs.

    Returns:
        (returnMat, classLabelVector): a (numberOfLines x 3) float array and
        the list of labels as strings, in file order. Blank lines are
        skipped. An empty file yields a (0 x 3) array and an empty list.

    Raises:
        ValueError: if a feature field is not a valid float, or a line has
            fewer than three feature fields.
    """
    rows = []
    classLabelVector = []
    # `with` closes the file even if a line fails to parse.
    with open(filename) as fr:
        for line in fr:
            line = line.strip()
            if not line:
                continue  # tolerate blank lines, e.g. a trailing newline at EOF
            listFromLine = line.split('\t')
            rows.append([float(value) for value in listFromLine[0:3]])
            classLabelVector.append(listFromLine[-1])

    # The line count is a by-product of the single pass.
    numberOfLines = len(rows)
    returnMat = np.zeros((numberOfLines, 3))
    if numberOfLines:
        returnMat[:] = rows
    return returnMat, classLabelVector
I am doing text processing and using 'readline()' function as follows:
ifd = open(...)
for line in ifd:
while (condition)
do something...
line = ifd.readline()
condition = ....
#Here when the condition becomes false I need to rewind the pointer so that the 'for' loop read the same line again.
Calling ifd.seek() followed by readline() gives me only a '\n' character. How can I rewind the pointer so that the whole line is read again?
>>> ifd.seek(-1,1)
>>> line = ifd.readline()
>>> line
'\n'
Here is my code
# Convert a tab-separated input file with one lab test per line into an
# output file with one row per (items[0], items[1], items[3]) key and one
# column per lab test name.
# NOTE(review): the original indentation of this listing was lost; the
# nesting below is a reconstruction -- confirm against the author's file.
# NOTE(review): the `print "..."` statement implies Python 2. There,
# `for line in ifd` uses a read-ahead buffer, so tell()/readline()/seek()
# called inside the loop do not operate on the position of the line the loop
# last returned -- presumably the cause of the stray '\n' described in the
# question.
labtestnames = sorted(tmp)
#Now read each line in the inFile and write into outFile
ifd = open(inFile, "r")
ofd = open(outFile, "w")
#read the header
header = ifd.readline() #Do nothing with this line. Skip
#Write header into the output file
nl = "mrn\tspecimen_id\tlab_number\tlogin_dt\tfluid"
# Number of fixed leading columns; lab-test columns start at this index.
offset = len(nl.split("\t"))
nl = nl + "\t" + "\t".join(labtestnames)
ofd.write(nl+"\n")
lenFields = len(nl.split("\t"))
print "Reading the input file and converting into modified file for further processing (correlation analysis etc..)"
prevTup = (0,0,0)
rowComplete = 0
k=0
for line in ifd:
    k=k+1
    # Stop once the counter reaches 200.
    if (k==200): break
    items = line.rstrip("\n").split("\t")
    if((items[0] =='')):
        continue
    # One empty cell per output column; the five fixed columns are filled first.
    newline= list('' for i in range(lenFields))
    newline[0],newline[1],newline[3],newline[2],newline[4] = items[0], items[1], items[3], items[2], items[4]
    ltests = []
    ltvals = []
    while(cmp(prevTup, (items[0], items[1], items[3])) == 0): # If the same mrn, lab_number and specimen_id then fill the same row. else create a new row.
        ltests.append(items[6])
        ltvals.append(items[7])
        # Remember where the next line starts so it can be re-read later.
        pos = ifd.tell()
        line = ifd.readline()
        prevTup = (items[0], items[1], items[3])
        items = line.rstrip("\n").split("\t")
        rowComplete = 1
    if (rowComplete == 1): #If the row is completed, prepare newline and write into outfile
        indices = [labtestnames.index(x) for x in ltests]
        j=0
        # Rewind to the line that ended the group so it is read again.
        ifd.seek(pos)
        for i in indices:
            newline[i+offset] = ltvals[j]
            j=j+1
    if (rowComplete == 0): #
        currTup = (items[0], items[1], items[3])
        # Here ltests/ltvals are rebound to single strings, not lists.
        ltests = items[6]
        ltvals = items[7]
        # Peek at the following line to see whether it has the same key.
        pos = ifd.tell()
        line = ifd.readline()
        items = line.rstrip("\n").split("\t")
        newTup = (items[0], items[1], items[3])
        if(cmp(currTup, newTup) == 0):
            prevTup = currTup
            ifd.seek(pos)
            continue
        else:
            indices = labtestnames.index(ltests)
            newline[indices+offset] = ltvals
    # NOTE(review): newline is a list here, so list + str concatenation
    # would raise TypeError -- presumably "\t".join(newline) was intended.
    ofd.write(newline+"\n")
The problem can be handled more simply using itertools.groupby. groupby can cluster all the contiguous lines that deal with the same mrn, specimen_id, and lab_num.
The code that does this is
for key, group in IT.groupby(reader, key = mykey):
where reader iterates over the lines of the input file, and mykey is defined by
def mykey(row):
    """Return the grouping key of a row: its (mrn, specimen_id, lab_num) values."""
    mrn = row['mrn']
    specimen_id = row['specimen_id']
    lab_num = row['lab_num']
    return (mrn, specimen_id, lab_num)
Each row from reader is passed to mykey, and all rows with the same key are clustered together in the same group.
While we're at it, we might as well use the csv module to read each line into a dict (which I call row). This frees us from having to deal with low-level string manipulation like line.rstrip("\n").split("\t") and instead of referring to columns by index numbers (e.g. row[3]) we can write code that speaks in higher-level terms such as row['lab_num'].
import itertools as IT
import csv
# Pivot a tab-separated file with one lab test per line into one output row
# per contiguous (mrn, specimen_id, lab_num) group, with one column per test.
inFile = 'curious.dat'
outFile = 'curious.out'
def mykey(row):
    """Return the grouping key of a row: its (mrn, specimen_id, lab_num) values."""
    return (row['mrn'], row['specimen_id'], row['lab_num'])
# Output columns: four identifying fields followed by the lab-test names.
fieldnames = 'mrn specimen_id date lab_num Bilirubin Lipase Calcium Magnesium Phosphate'.split()
# NOTE(review): binary modes 'rb'/'wb' are the Python 2 convention for the
# csv module; Python 3 expects text mode with newline='' -- confirm the target
# interpreter.
with open(inFile, 'rb') as ifd:
    # Column names are taken from the input file's header line.
    reader = csv.DictReader(ifd, delimiter = '\t')
    with open(outFile, 'wb') as ofd:
        writer = csv.DictWriter(
            ofd, fieldnames, delimiter = '\t', lineterminator = '\n', )
        writer.writeheader()
        # groupby clusters only CONSECUTIVE rows that share the same key.
        for key, group in IT.groupby(reader, key = mykey):
            new = {}
            # The first row of the group supplies the identifying columns.
            row = next(group)
            # Note: this inner loop rebinds `key`; the group key is not used afterwards.
            for key in ('mrn', 'specimen_id', 'date', 'lab_num'):
                new[key] = row[key]
            new[row['labtest']] = row['result_val']
            # Remaining rows of the group each add one test -> value entry.
            for row in group:
                new[row['labtest']] = row['result_val']
            writer.writerow(new)
yields
mrn specimen_id date lab_num Bilirubin Lipase Calcium Magnesium Phosphate
4419529 1614487 26.2675 5802791G 0.1
3319529 1614487 26.2675 5802791G 0.3 153 8.1 2.1 4
5713871 682571 56.0779 9732266E 4.1
This seems to be a perfect use case for yield expressions. Consider the following example that prints lines from a file, repeating some of them at random:
def buflines(fp):
    """Yield items from *fp*, allowing the consumer to push one back.

    Iterate over the generator as usual. To see an item again on the next
    iteration, call ``gen.send(item)`` from inside the loop body; that call
    itself returns None, and the following iteration yields the sent item
    instead of reading a new one from *fp*.

    Args:
        fp: an iterable of truthy items (e.g. an open text file, whose lines
            are never empty strings). A falsy item passed to send() is
            ignored.

    Yields:
        The next item from *fp*, or the item most recently passed to send().
    """
    source = iter(fp)
    r = None
    while True:
        if not r:
            try:
                r = next(source)
            except StopIteration:
                # End of input: finish cleanly. Letting StopIteration escape
                # from a generator body is turned into RuntimeError by
                # PEP 479 (Python 3.7+).
                return
        # The value of the yield expression is None for a plain next() call
        # or the pushed-back item for send().
        r = yield r
        if r:
            # Answer the send() call itself; the pushed-back item is then
            # delivered at the top of the loop on the next iteration.
            yield None
from random import randint
# Demo: print every line of the file; when the random draw exceeds 80, push
# the line back so that the loop yields it once more.
with open('filename') as fp:
    buf = buflines(fp)
    for line in buf:
        print line
        if randint(1, 100) > 80:
            print 'ONCE AGAIN::'
            # send() hands the line back to the generator; the next loop
            # iteration receives this same line again.
            buf.send(line)
Basically, if you want to process an item once again, you send it back to the generator. On the next iteration you will be reading the same item once again.