This function doesn't convert the file into a matrix properly; when I run the code I get this error message:
returnMat[index,:] = listFromLine[0:3]
ValueError: could not convert string to float.
# Load a tab-separated text file: the first three fields of every line go into
# one row of a float matrix and the last field is collected as an int label.
# NOTE(review): indentation was lost when this listing was pasted; the code
# lines are kept exactly as posted.
def fileToMatrix(filename):
# First pass: read all lines only to learn how many rows the matrix needs.
fr = open(filename)
numberOfLines = len(fr.readlines())
returnMat = np.zeros((numberOfLines,3))
classLabelVector = []
# The first handle is now at end-of-file, so the file is opened a second time.
fr = open(filename)
index = 0
# This is the line behind the reported ValueError: readline() returns a single
# string, so the loop visits its characters one by one instead of the lines of
# the file (readlines() was intended).
for line in fr.readline():
line = line.strip() # drop surrounding whitespace, including the newline
listFromLine = line.split('\t') # split on tab and make list
# Assigning strings into the float matrix makes NumPy convert them to float.
returnMat[index,:] = listFromLine[0:3]
classLabelVector.append(int(listFromLine[-1]))
index += 1
return returnMat, classLabelVector
# Script entry point: parse "mydata.txt" and print the resulting matrix.
if __name__ == '__main__':
filename ="mydata.txt"
returnMat,classLabelVector=fileToMatrix(filename)
print(returnMat)
and data file looks like:
enter image description here
haha... error is in the line:
for line in fr.readline():
I should have used the readlines() function to read all the lines of the .txt file.
Related
I am working on creating a program to concatenate rows within a file. Each file has a header, datarows labeled DAT001 to DAT113 and a trailer. Each line of concatenated rows will have DAT001 to DAT100 and 102-113 is optional. I need to print the header, concatenating DAT001-113 and when the file finds a row with DAT001 I need to start a new line concatenating DAT001-113 again. After that is all done, I will print the trailer. I have an IF statement started but it only writes the header and skips all other logic. I apologize that this is very basic - but I am struggling with reading rows over and over again without knowing how long the file might be.
I have tried the below code but it won't read or print after the header.
import pandas as pd
# Question code as posted (indentation lost; code lines kept exactly as
# written). It reads sourceFile line by line and tries to write the header,
# the combined DATxxx rows and the trailer to destinationFile.
destinationFile = "./destination-file.csv"
sourceFile = "./TEST.txt"
# Marker substrings that are searched for inside each input line.
header = "RHR"
data = "DPSPOS"
beg_data = "DAT001"
data2 = "DAT002"
data3 = "DAT003"
data4 = "DAT004"
data5 = "DAT005"
data6 = "DAT006"
data7 = "DAT007"
data8 = "DAT008"
data100 = "DAT100"
data101 = "DAT101"
data102 = "DAT102"
data103 = "DAT103"
data104 = "DAT104"
data105 = "DAT105"
data106 = "DAT106"
data107 = "DAT107"
data108 = "DAT108"
data109 = "DAT109"
data110 = "DAT110"
data111 = "DAT111"
data112 = "DAT112"
data113 = "DAT113"
# Accumulators for the required part and the two optional parts of a record.
req_data = ''
opt101 = ''
opt102 = ''
with open(sourceFile) as Tst:
# Reads the whole file into memory and walks it one line at a time.
for line in Tst.read().split("\n"):
if header in line:
# NOTE(review): mode "w+" truncates the file each time it is opened, so every
# one of the three open(...) calls in this loop discards earlier output.
with open(destinationFile, "w+") as dst:
dst.write(line)
elif data in line:
if beg_data in line:
req_data = line+line+line+line+line+line+line+line+line
if data101 in line:
opt101 = line
if data102 in line:
opt102 = line
# NOTE(review): pd.concat expects pandas objects (Series/DataFrame), not str;
# plain string concatenation was presumably intended -- confirm.
new_line = pd.concat(req_data,opt101,opt102)
with open(destinationFile, "w+") as dst:
dst.write(new_line)
else:
# NOTE(review): `trailer` is never assigned anywhere in this listing.
if trailer in line:
with open(destinationFile, "w+") as dst:
dst.write(line)
Just open the output file once for the whole loop, not every time through the loop.
Check whether the line begins with DAT001 (the value of `beg_data`). If it does, write the trailer to the current line and start a new line by printing the header.
Then for every line that begins with DAT, write it to the file in the current line.
# Merge the DAT rows of `sourceFile` into one output line per record.
# `sourceFile`, `destinationFile`, `beg_data`, `header` and `trailer` are
# expected to be defined by the surrounding script.
first_line = True  # stays True until the first record has been started
with open(sourceFile) as src, open(destinationFile, "w+") as dst:
    for row in src.read().split("\n"):
        if row.startswith(beg_data):
            # A row starting with `beg_data` opens a new record: finish the
            # record in progress (if any), then emit the header.
            if first_line:
                first_line = False
            else:
                dst.write(trailer + '\n')
            dst.write(header)
        if row.startswith('DAT'):
            # Every DAT row is appended to the record currently being built.
            dst.write(row)
    # Terminate the final record.
    dst.write(trailer + '\n')
See if the following code helps you make progress. It has not been tested because no
minimal reproducible example was provided.
# Variant of the question's loop that opens the destination once, in append
# mode, instead of re-opening it for every write. Indentation was lost in this
# listing; code lines are kept exactly as posted.
with open(destinationFile, "a") as dst:
# `dst` stays open until all the code nested under this `with` has run.
with open(sourceFile) as Tst:
# `Tst` likewise stays open for the whole nested loop.
for line in Tst.read().split("\n"):
if header in line:
dst.write(line)
elif data in line:
if beg_data in line:
req_data = line + line + line + line + line + line + line + line + line
if data101 in line:
opt101 = line
if data102 in line:
opt102 = line
# NOTE(review): carried over from the question unchanged -- pd.concat expects
# pandas objects, not str, so this call presumably still fails; confirm.
new_line = pd.concat(req_data, opt101, opt102)
dst.write(new_line)
else:
# NOTE(review): `trailer` is not assigned in this listing.
if trailer in line:
dst.write(line)
# `with` is a context manager: both files are closed automatically on exit.
# Question code as posted (indentation lost): expand every 'name>count' tile
# of a comma-separated line in place.
file = open(fullname, 'r')
for line in file:
# Drop the trailing newline, then split the line into its tiles.
if line.endswith('\n'): line = line[:-1]
line = line.split(',')
for tile in line:
index = line.index(tile)
tile = tile.split('>')
print(tile)
copies = int(tile[1])
# NOTE(review): this repeats the whole ['name', 'count'] list, and the slice
# assignment below splices those pieces into `line` while the for loop is
# still iterating over it. The next `tile` is then the bare count string,
# which has no '>' -- matching the ['20'] / IndexError output shown below.
tile = tile * copies
line[index:index+1] = tile
the text file format:
block>20, otherblock>10
the output:
['block', '20']
['20']
Traceback (most recent call last):
File "C:/Users/CAIO/Documents/Pycharm/vitoria/main.py", line 92, in <module>
main()
File "C:/Users/CAIO/Documents/Pycharm/vitoria/main.py", line 77, in main
test_map = MapClass("map.txt")
File "C:/Users/CAIO/Documents/Pycharm/vitoria/main.py", line 23, in __init__
self.load_map(name)
File "C:/Users/CAIO/Documents/Pycharm/vitoria/main.py", line 39, in load_map
copies = int(tile[1])
IndexError: list index out of range
Process finished with exit code 1
when i reference tile[1] it states the index is invalid, and when i reference tile[0] it's just 'block', not ['block','10'].
Printing tile before splitting results in:
'block>20'
'20'
i'm too tired for this at this point, it's probably something dumb i'm skipping
Based on your code, it looks like you want to expand the 'text>count' format with the text repeated 'count' times.
Try this code. Note that this code removes the leading space before the text.
ss = '''
block>20, otherblock>10
b2>21, ob2>12
b3>22, ob3>13
'''.strip()
with open('test.csv', 'w') as f:
    f.write(ss)  # write test file

##############
fullname = 'test.csv'
alllines = []  # one expanded output line per non-blank input line

# `with` guarantees the handle is closed, even if a tile is malformed.
with open(fullname, 'r') as src:
    for line in src:
        if not line.strip():
            continue  # a blank line carries no tiles
        expanded = []
        for tile in line.rstrip('\n').split(','):
            # Each tile looks like 'text>count'; surrounding spaces are dropped.
            text, count = tile.strip().split('>')
            expanded.append(text * int(count))
        # Rebuild the line from the expanded tiles rather than running
        # str.replace on the raw line: replace() would also rewrite a tile
        # that merely occurs inside another one (e.g. 'b>2' inside 'ob>2').
        alllines.append(','.join(expanded))

print('\n'.join(alllines))
Output
blockblockblockblockblockblockblockblockblockblockblockblockblockblockblockblockblockblockblockblock,otherblockotherblockotherblockotherblockotherblockotherblockotherblockotherblockotherblockotherblock
b2b2b2b2b2b2b2b2b2b2b2b2b2b2b2b2b2b2b2b2b2,ob2ob2ob2ob2ob2ob2ob2ob2ob2ob2ob2ob2
b3b3b3b3b3b3b3b3b3b3b3b3b3b3b3b3b3b3b3b3b3b3,ob3ob3ob3ob3ob3ob3ob3ob3ob3ob3ob3ob3ob3
If you want minimal code, you can use list comprehension.
ss = '''
block>20, otherblock>10
b2>21, ob2>12
b3>22, ob3>13
'''.strip()
with open('test.csv', 'w') as f:
    f.write(ss)  # write test file

#######################
fullname = 'test.csv'  # defined here so the snippet also runs on its own


def _expand(tile):
    """Turn one 'text>count' tile into `text` repeated `count` times."""
    text, count = tile.strip().split('>')
    return text * int(count)


with open(fullname, 'r') as f:
    lines = f.readlines()

# Split on ',' and strip each tile, so the result does not depend on the
# separator being exactly ', '; blank lines are skipped.
xx = '\n'.join(
    ','.join(_expand(e) for e in ln.split(','))
    for ln in lines
    if ln.strip()
)
print(xx)
Output is the same
I am trying to read 3 log files and use parsing to extract the required information; I need this code to run in a loop and pick up new lines if they meet the required parameters.
I wrote the following code:
import os
# Question code as posted (indentation lost; code lines kept exactly as
# written). Goal stated in the post: collect each line containing 'error'
# from three log files, once per file, and print it.
x_list = []
y_list = []
z_list = []
x_log = open('x.txt')
for line in x_log:
line = line.rstrip()
if 'error' in line:
x = line
# NOTE(review): this loops over the open file object again, and the test below
# checks membership in the file object rather than in x_list.
for x in x_log:
if not x in x_log:
x_list.append(x)
print('ERROR1',x)
y_log = open('y.txt')
for line in y_log:
line = line.rstrip()
if 'error' in line:
x = line
# NOTE(review): unlike the x and z sections, this loops over y_list itself,
# which is only ever appended to from inside this loop.
for x in y_list:
if not x in y_list:
y_list.append(x)
print('ERROR2',x)
z_log = open('z.txt')
for line in z_log:
line = line.rstrip()
if 'error' in line:
x = line
# NOTE(review): loops over the file object but tests membership in z_list --
# a third variation; the three sections are inconsistent with each other.
for x in z_log:
if not x in z_list:
z_list.append(x)
print('ERROR3',x)
what I am trying to accomplish:
1. read the file.
2. search for relevant line.
3. if the information does not exist in the list, append to list.
4. print line.
I need help setting up a while loop, and I am definitely doing something wrong when comparing the line to the contents of the list.
UPDATE1:
Ok so I managed to get my code to work by adding:
and line not in x_list:
to my original line:
if 'error' in line:
so now I got:
if 'error' in line and line not in x_list:
full code:
# Lines already reported for each log file, in order of first appearance.
x_list = []
y_list = []
z_list = []

# (log path, list of already-reported lines, tag printed before a new line)
for path, seen, tag in (
    ('x.txt', x_list, 'ERROR-X'),
    ('y.txt', y_list, 'ERROR-Y'),
    ('z.txt', z_list, 'ERROR-Z'),
):
    # `with` closes each log as soon as it has been scanned; the files are
    # processed one after another in x, y, z order.
    with open(path) as log:
        for line in log:
            line = line.rstrip()
            # Report a line only the first time it is seen in this file.
            if 'error' in line and line not in seen:
                seen.append(line)
                print(tag, line)
it does what i need but i still need to run it in a loop, can anyone help me?
UPDATE2:
managed to get it to work in a loop, if a new line is added and it meets the parsing parameters it will be printed.
code:
import time

# Pause between scans so the endless loop does not spin at full CPU.
POLL_INTERVAL_SECONDS = 1.0

# Lines already reported for each log file, in order of first appearance.
x_list = []
y_list = []
z_list = []

# (log path, list of already-reported lines, tag printed before a new line)
watched_logs = (
    ('x.txt', x_list, 'ERROR-X'),
    ('y.txt', y_list, 'ERROR-Y'),
    ('z.txt', z_list, 'ERROR-Z'),
)

# Re-scan the logs forever; a line is printed only the first time it is seen,
# so each pass reports just the newly added error lines.
while True:
    for path, seen, tag in watched_logs:
        # `with` closes the handle after every scan instead of leaving one
        # more open file behind on each pass.
        with open(path, 'r') as log:
            for line in log:
                line = line.rstrip()
                if 'error' in line and line not in seen:
                    seen.append(line)
                    print(tag, line)
    time.sleep(POLL_INTERVAL_SECONDS)
The optimized approach:
def get_error_lines(fp, lines_set, suffix=''):
    """Print every not-yet-seen line of *fp* that contains 'error'.

    fp        - iterable of text lines (e.g. an open file);
    lines_set - set of error lines already reported; new ones are added to it;
    suffix    - text appended to the 'ERROR' label in the printed output.
    """
    label = 'ERROR' + suffix
    for raw in fp:
        text = raw.rstrip()
        # Skip lines that are not errors or that were reported before.
        if 'error' not in text or text in lines_set:
            continue
        lines_set.add(text)
        print(label, text)
# One set per log file: a set keeps each reported line unique.
x_set, y_set, z_set = set(), set(), set()

# All three logs are opened up front and closed together when the block ends.
with open('x.txt', 'r') as x_log, open('y.txt', 'r') as y_log, open('z.txt', 'r') as z_log:
    for log, seen, number in ((x_log, x_set, '1'), (y_log, y_set, '2'), (z_log, z_set, '3')):
        get_error_lines(log, seen, number)
The code below has two identical lines, but I think fr is already opened by the first one. I tried removing the second line, but then the code failed. So why do we need to open the file every time we use it?
# Read a tab-separated file into an (n_lines, 3) matrix plus a list holding
# the last field of every line (kept as a string in this version).
# Indentation was lost in this listing; code lines are kept exactly as posted.
def file2matrix(filename):
fr = open(filename) #<-------------------------
# readlines() consumes the whole file, leaving this handle at end-of-file.
numberOfLines = len(fr.readlines())
returnMat = np.zeros((numberOfLines,3))
classLabelVector = []
# Opening the file again gives a fresh handle positioned at the start; the
# earlier handle is never explicitly closed.
fr = open(filename) # <------------------------
index = 0
for line in fr.readlines():
line = line.strip()
listFromLine = line.split('\t')
# First three fields fill row `index`; NumPy converts the strings to float.
returnMat[index,:] = listFromLine[0:3]
classLabelVector.append(listFromLine[-1])
index += 1
return returnMat, classLabelVector
You don't need to reopen the file, but you do need to go back to the beginning.
The readline() function reads a line in a file. Each time you call readline(), the pointer will move to the next line.
readlines() calls readline() until it gets to the end of the file. To move back to the beginning afterwards, call fr.seek(0) or close and reopen the file. (Generally, it's good practice to close a file when you are done with it, even if you only read it once.)
If you only want to go through the file once, you can count the number of lines as you move through the file, and then return that number.
Original:
# Quoted copy of the question's function: two passes over the file, the first
# to count lines, the second to fill the matrix and collect the labels.
# Indentation was lost in this listing; code lines are kept exactly as posted.
def file2matrix(filename):
fr = open(filename)
# Pass 1: count the lines; this leaves the handle at end-of-file.
numberOfLines = len(fr.readlines())
returnMat = np.zeros((numberOfLines,3))
classLabelVector = []
# Pass 2 starts from a freshly opened handle.
fr = open(filename)
index = 0
for line in fr.readlines():
line = line.strip()
listFromLine = line.split('\t')
returnMat[index,:] = listFromLine[0:3]
# The label (last field) is stored as a string here.
classLabelVector.append(listFromLine[-1])
index += 1
return returnMat, classLabelVector
Another way:
def file2matrix(filename):
    """Parse a tab-separated file in a single pass.

    The first three fields of each non-blank line become one row of the
    returned float matrix; the last field is returned as a string label.

    Args:
        filename: path of the text file to read.

    Returns:
        (returnMat, classLabelVector): an array of shape (n_lines, 3) and a
        list with one label per row.

    Raises:
        ValueError: if a line has fewer than three usable fields or a field
            cannot be converted to float.
    """
    rows = []
    classLabelVector = []
    # Iterating the handle stops at end-of-file by itself; readline() returns
    # '' (never None) at EOF, so a `while line is not None` loop cannot end.
    with open(filename) as fr:
        for line in fr:
            line = line.strip()
            if not line:
                continue  # skip blank lines, e.g. a trailing newline
            listFromLine = line.split('\t')
            rows.append(listFromLine[0:3])
            classLabelVector.append(listFromLine[-1])

    # The line count is only known after the pass, so the matrix is created
    # now and then filled; creating it last must not discard the parsed data.
    numberOfLines = len(rows)
    returnMat = np.zeros((numberOfLines, 3))
    for index, row in enumerate(rows):
        returnMat[index, :] = row  # NumPy converts the strings to float
    return returnMat, classLabelVector
The .txt file holding the data is as follows (source: "datingTestSet2.txt" in Ch.2 here):
40920 8.326976 0.953952 largeDoses
14488 7.153469 1.673904 smallDoses
26052 1.441871 0.805124 didntLike
75136 13.147394 0.428964 didntLike
38344 1.669788 0.134296 didntLike
...
Code:
from numpy import *
import operator
from os import listdir
# Question code as posted (indentation lost; code lines kept exactly as
# written). The Code1 / Code2 markers are the lines the question asks about.
def file2matrix(filename):
fr = open(filename)
# Code1 (disabled): enabling it would read the file to the end before the
# line count below is taken.
# arr = fr.readlines() # Code1!!!!!!!!!!!!!!!!!!!
numberOfLines = len(fr.readlines()) #get the number of lines in the file
returnMat = zeros((numberOfLines,3)) #prepare matrix to return
classLabelVector = [] #prepare labels return
# Code2: a fresh handle, so the loop below reads from the start of the file.
fr = open(filename) # Code2!!!!!!!!!!!!!!!!!!!!!
index = 0
for line in fr.readlines():
line = line.strip()
listFromLine = line.split('\t')
returnMat[index,:] = listFromLine[0:3]
# The last field is converted to int, so it must hold a numeric label.
classLabelVector.append(int(listFromLine[-1]))
index += 1
return returnMat,classLabelVector
# Example call on the data file named in the question.
datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
The result of this function is:
datingDataMat datingLabels
40920 8.326976 0.953952 3
14488 7.153469 1.673904 2
26052 1.441871 0.805124 1
75136 13.147394 0.428964 1
38344 1.669788 0.134296 1
72993 10.141740 1.032955 1
35948 6.830792 1.213192 3
42666 13.276369 0.543880 3
67497 8.631577 0.749278 1
35483 12.273169 1.508053 3
50242 3.723498 0.831917 1
... ... ... ...
My questions are:
When I just remove the Code2(fr = open(filename) which above the index = 0),
the result of the function becomes all zeros matrix, and all zeros vector.
Why can't I remove Code2? Doesn't the first line (fr = open(filename)) work?
When I just add the Code1(arr = fr.readlines()), it is wrong. Why???
returnMat[index,:] = listFromLine[0:3]
IndexError: index 0 is out of bounds for axis 0 with size 0
1) You can't remove the Code2 line because of this line:
numberOfLines = len(fr.readlines()) #get the number of lines in the file
In that line you are reading to the end of the file. Opening it again puts you at the start of the file...
2) Similar to the answer above: a call to readlines() reads all the lines and moves the file cursor to the end of the file, so if you then call readlines() on the file again there is nothing left to read, hence it fails.
You are at the end of the file. Therefore, your second attempt to read the file content yields nothing. You need to go back to the beginning of the file. Use:
fr.seek(0)
Instead of your:
fr = open(filename) # Code2!!!!!!!!!!!!!!!!!!!!!
You only need to readlines once.
def file2matrix(filename):
    """Read *filename* once and split it into a feature matrix and labels.

    Each line is tab-separated: the first three fields fill one row of the
    returned matrix, the last field is converted to an int label.
    """
    with open(filename) as fr:
        lines = fr.readlines()

    # One matrix row per line read; zeros() creates a float array.
    returnMat = zeros((len(lines), 3))
    classLabelVector = []
    for index, line in enumerate(lines):
        listFromLine = line.strip().split('\t')
        # The fields are strings; assigning them into the float matrix makes
        # NumPy convert them.
        returnMat[index, :] = listFromLine[0:3]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector
I can suggest a few other changes:
def file2matrix(filename):
    """Read a tab-separated file into a float matrix and a list of int labels.

    The first three fields of every line become one matrix row; the last
    field of every line is converted to int and returned in a separate list.
    """
    with open(filename) as f:
        records = [raw.strip().split('\t') for raw in f.readlines()]

    features = [fields[0:3] for fields in records]
    classLabelList = [int(fields[-1]) for fields in records]
    # dtype=float makes NumPy convert the string fields while building the array.
    return np.array(features, dtype=float), classLabelList
or even
def file2matrix(filename):
    """Compact version: parse a tab-separated file with comprehensions.

    Args:
        filename: path of the text file to read.

    Returns:
        (returnMat, classLabelList): a float array of shape (n_lines, 3)
        built from the first three fields of each non-blank line, and a list
        with the last field of each line converted to int.

    Raises:
        ValueError: if a field cannot be converted, or a line has fewer than
            three feature fields.
    """
    with open(filename) as f:
        # Split every non-blank line into its fields. The comprehension over
        # the lines is required: a bare `line.strip()` has no `line` to use.
        ll = [line.strip().split('\t') for line in f if line.strip()]

    # reshape(-1, 3) keeps the result two-dimensional even for an empty file.
    returnMat = np.array([fields[0:3] for fields in ll], dtype=float).reshape(-1, 3)
    classLabelList = [int(fields[-1]) for fields in ll]
    # classLabelVec = np.array([fields[-1] for fields in ll], dtype=int)
    return returnMat, classLabelList