python: read file and split the data

python: read file and split the data - python

I have a file config and the contents are separated by space " "
cat config
/home/user1 *.log,*.txt 30
/home/user2 *.trm,*.doc,*.jpeg 10
I want to read this file,parse each line and print each field from the each line.
Ex:-
Dir = /home/user1
Fileext = *.log,*.txt
days=30
I couldn't go further than the below..
def dir():
file = open('config','r+')
cont = file.readlines()
print "file contents are %s" % cont
for i in range(len(cont)):
j = cont[i].split(' ')
dir()
Any pointers how to move further?

Your code is fine, you are just missing the last step processing each element of the splitted string, try this:
def dir():
file = open('config','r+')
cont = file.readlines()
print "file contents are %s" % cont + '\n'
elements = []
for i in range(len(cont)):
rowElems = cont[i].split(' ')
elements.append({ 'dir' : rowElems[0], 'ext' : rowElems[1], 'days' : rowElems[2] })
for e in elements:
print "Dir = " + e['dir']
print "Fileext = " + e['ext']
print "days = " + e['days']
dir()
At the end of this code, you will have all the rows processed and stored in an array of dictionaries you can easily access later.

You can write a custom function to parse each line, and then use the map function to apply that function against each line in file.readlines():
def parseLine(line):
# function to split and parse each line,
# and return the formatted string
Dir, FileExt, Days = line.split(' ')[:3]
return 'Dir = {}\nFileext = {}\nDays = {}'.format(Dir, FileExt, Days)
def dir():
with open('config','r+') as file:
print 'file contents are\n' + '\n'.join(map(parseLine, file.readlines()))
Results:
>>> dir()
file contents are
Dir = /home/user1
Fileext = *.log,*.txt
Days = 30
Dir = /home/user2
Fileext = *.trm,*.doc,*.jpeg
Days = 10

Related

Python coding for opening and saving data to a file

I am having an issue getting the train function to work correctly in python. I can not modify the def function. I am at the point where I need to get the second file to read lines one at a time for PosList and i need to match the value of movieWordCount[z] in OpenPos. If the file is there, then I am good to incrment column 2 by one of t hat line (segmented by a space). If it is not, then I need the else to append it to the file end. It does not work. It does not append the values if it is missing and I am not sure if it will find the value if it is there. I have been stuck getting thsi to work for two days.
Here is my code segment I am working with:
with open("PosList") as OpenPos:
lines = OpenPos.readlines()
print lines
if movieWordCount[z] in lines:
print "found"
#Now use tokenize to split it apart by space and set to new array for me to call column2
else:
print "not found"
lines.append(movieWordCount[z] + " 1" + "\n")
Here is my full code:
#!/usr/bin/python
#Import Counter
import collections
from collections import Counter
#Was already here but pickle is used for data input and export
import math, os, pickle, re
class Bayes_Classifier:
def __init__(self, trainDirectory = "movie_reviews/"):
#If file listing exists skip to train
if os.path.isfile('iFileList'):
print "file found"
self.train()
#self.classify()
#If file listing does not exist skip to train
if not os.path.isfile('iFileList'):
print "no file"
newfile = 'iFileList'
tempList = set()
subDir = './movie_reviews'
for filenames in os.listdir(subDir):
my_sub_path = os.path.join(os.sep,subDir,filenames)
tempList.add(filenames)
self.save("filenames", "try3")
f = []
for fFileObj in os.walk("movie_reviews/"):
f.extend(fFileObj)
break
pickle.dump(f, open( "save.p", "wb" ))
self.save(f, "try4")
with open(newfile, 'wb') as fi:
pickle.dump(tempList, fi)
#print tempList
self.train()
#self.classify()
def train(self):
'''Trains the Naive Bayes Sentiment Classifier.'''
print "File ready for training"
#Open iFileList to use as input for opening movie files
x = 0
OpenIFileList = open('iFileList','r')
print "iFileList now Open"
#Loop through the file
for line in OpenIFileList:
#print "Ready to read lines"
#print "reading line " + line
if x > 4:
if x % 2 == 0:
#print line
s = line
if '-' in s:
comp = s.split("'")
#print comp[2]
print comp[1] #This is What you need for t he movie file
compValue1 = comp[1]
#Determine Positive/Negative.
#compType is the variable I am storing it to.
compType = compValue1.split("-",2)[1]
#print compType #Prints that middle value like 5 or 1
# This will do the work based on the value.
if compType == '5':
# print "you have a five" #Confirms the loop I am in.
#If file does not exists create it
if not os.path.exists('PosList'):
print "no file"
file('PosList', 'w').close()
#Open file that needs to be reviewed for word count
compValue2 = "movie_reviews/" + compValue1
print compValue2 #Prints the directory and file path
OpenMovieList = open(compValue2,'r')
for commentLine in OpenMovieList:
commentPositive = commentLine.split(" ")
commentPositiveCounter = Counter(commentPositive)
#print commentPositiveCounter # " Comment Pos goes here"
#if commentLine != '' or commentLine != ' ':
#Get first word, second word, ....
if commentLine and (not commentLine.isspace()):
movieWordCount = self.tokenize(commentLine)
y = len(movieWordCount) #determines length of string
print y
z = 0
#print movieWordCount[0] # Shows the zero position in the file.
while z < y:
print "position " + str(z) + " word is " + movieWordCount[z] # Shows the word we are at and position id
with open("PosList") as OpenPos:
lines = OpenPos.readlines()
print lines
if movieWordCount[z] in lines:
print "found"
else:
print "not found"
lines.append(movieWordCount)
z = z + 1
#Close the files
OpenMovieList.close()
OpenPos.close()
x += 1
#for line2 in OpenIFileList.readlines():
#for line in open('myfile','r').readlines():
#do_something(line)
#Save results
#Close the File List
OpenIFileList.close()
def loadFile(self, sFilename):
'''Given a file name, return the contents of the file as a string.'''
f = open(sFilename, "r")
sTxt = f.read()
f.close()
return sTxt
def save(self, dObj, sFilename):
'''Given an object and a file name, write the object to the file using pickle.'''
f = open(sFilename, "w")
p = pickle.Pickler(f)
p.dump(dObj)
f.close()
def load(self, sFilename):
'''Given a file name, load and return the object stored in the file.'''
f = open(sFilename, "r")
u = pickle.Unpickler(f)
dObj = u.load()
f.close()
return dObj
def tokenize(self, sText):
'''Given a string of text sText, returns a list of the individual tokens that
occur in that string (in order).'''
lTokens = []
sToken = ""
for c in sText:
if re.match("[a-zA-Z0-9]", str(c)) != None or c == "\'" or c == "_" or c == '-':
sToken += c
else:
if sToken != "":
lTokens.append(sToken)
sToken = ""
if c.strip() != "":
lTokens.append(str(c.strip()))
if sToken != "":
lTokens.append(sToken)
return lTokens

To open a file for writing, you can use
with open('PosList', 'w') as Open_Pos
As you are using the with form, you do not need to close the file; Python will do that for you at the end of the with-block.
So assuming that the way you add data to the lines variable is correct, you could remove the superfluous code OpenMovieList.close() and OpenPos.close(), and append 2 lines to your code:
with open("PosList") as OpenPos:
lines = OpenPos.readlines()
print lines
if movieWordCount[z] in lines:
print "found"
else:
print "not found"
lines.append(movieWordCount)
with open("PosList", "w") as OpenPos:
OpenPos.write(lines)

Extracting data from a text file to an output file

I have alot of files which names are just number. (Starting from 1 to whatever is the maximum number) and each of these files are similar to each other by their "tags" (ObjectID =, X =, Y =, etc.), but the values after those tags are not the same at all.
I wanted to make my job easier from manually copy/pasting the data from one file to another and made a small script using Python (since I am slightly experienced in it).
This is the full script:
import os
BASE_DIRECTORY = 'C:\Users\Tom\Desktop\TheServer\scriptfiles\Objects'
output_file = open('output.txt', 'w')
output = {}
file_list = []
for (dirpath, dirnames, filenames) in os.walk(BASE_DIRECTORY):
for f in filenames:
if 'txt' in str(f):
e = os.path.join(str(dirpath), str(f))
file_list.append(e)
for f in file_list:
print f
txtfile = open(f, 'r')
output[f] = []
for line in txtfile:
if 'ObjectID =' in line:
output[f].append(line)
elif 'X =' in line:
output[f].append(line)
elif 'Y =' in line:
output[f].append(line)
tabs = []
for tab in output:
tabs.append(tab)
tabs.sort()
for tab in tabs:
for row in output[tab]:
output_file.write(row + '')
Now, everything is working fine, the output file looks like this:
ObjectID = 1216
X = -1480.500610
Y = 2610.885742
ObjectID = 970
X = -1517.210693
Y = 2522.842285
ObjectID = 3802
X = -1512.156616
Y = 2521.116210
etc.
But I don't want it to be like that (each value has a new line). I need it to do this for every file:
Read the file.
Remove the tags infront of the values.
Format a single line which will have those values in the output folder. (Let's say I want to make it look like this: "(1216,-1480.500610,2522.842285)" )
Write that line in the output folder.
Repeat for every file.
Any help please?

Hope this helps.
data = open('sam.txt', 'r').read()
>>> print data
ObjectID = 1216
X = -1480.500610
Y = 2610.885742
ObjectID = 970
X = -1517.210693
Y = 2522.842285
ObjectID = 3802
X = -1512.156616
Y = 2521.116210
>>>
Now lets do some string replacements :)
>>> data = data.replace('ObjectID =', '').replace('\nX = ', ',').replace('\nY = ', ',')
>>> print data
1216,-1480.500610,2610.885742
970,-1517.210693,2522.842285
3802,-1512.156616,2521.116210

In your loop, keep track of whether you are 'in' a record:
records = []
in_record = False
id, x, y = 0, 0, 0
for line in txtfile:
if not in_record:
if 'ObjectID =' in line:
in_record = True
id = line[10:]
elif 'X =' in line:
x = line[3:]
elif 'Y =' in line:
y = line[3:]
records.append((id, x, y))
in_record = False
Then you'll have a list of tuples which you can easily write with the csv module.

Find here a version of the loop you have generating the contents.
I rewrote it so the line contents ObjectId, X and Y are in the same line.
It looks that is what you want to do:
for f in file_list:
print f
txtfile = open(f, 'r')
output[f] = []
for line in txtfile:
myline = ''
if 'ObjectID =' in line:
pos = line.rfind("ObjectID =") + len("ObjectID =")
rest = line[pos:]
# Here you set the delimiter after the ObjectID value. Can be ","
numbers = rest.split(" ")
if len(numbers) > 0:
myline.append(numbers[0])
elif 'X =' in line:
pos = line.rfind("X =") + len("X =")
rest = line[pos:]
# Here you set the delimiter after the ObjectID value. Can be ","
numbers = rest.split(" ")
if len(numbers) > 0:
myline.append(numbers[0])
elif 'Y =' in line:
pos = line.rfind("Y =") + len("Y =")
rest = line[pos:]
# Here you set the delimiter after the ObjectID value. Can be ","
numbers = rest.split(" ")
if len(numbers) > 0:
myline.append(numbers[0])
output[f].append(myline)
Note that you need to know which character (in the code the delimiter) separates the names you try to find: ObjectID = from the actual values you want to grab from the line.

Here is what you need. I did not have enough time to write the code for appending the result to a new file. Instead it just prints it, but you get the point.
import os.path
path = "path"
#getting the number of files in your folder
num_files = len([f for f in os.listdir(path)
if os.path.isfile(os.path.join(path, f))])
#function that returns your desired output for a given file
def file_head_ext(file_path, file_num):
with open(file_path + "/" + file_num) as myfile:
head = [next(myfile).split("=") for x in range(3)]
formatted_head = [elm[1].replace("\n",'').replace(" ","") for elm in head]
return(",".join(formatted_head))
for filnum in range(1,num_files):
print(file_head_ext(path, str(filnum)))

Copy 'N' lines from one file to another in python?

Essentially what I am attempting to do is read 'n' number of lines from a file and then write them to a separate file. This program essentially should take a file that has 100 lines and separate that file into 50 separate files.
def main():
from itertools import islice
userfile = raw_input("Please enter the file you wish to open\n(must be in this directory): ")
file1 = open(userfile, "r+")
#print "Name: ", file1.name
#print "Closed or not", file1.closed
#print "Opening mode: ", file1.mode
#print "Softspace flag: ", file1.softspace
jcardtop = file1.read(221);
#print jcardtop
n = 2
count = 0
while True:
next_n_lines = list(islice(file1,n))
print next_n_lines
count = count + 1
fileout = open(str(count)+ ".txt", "w+")
fileout.write(str(jcardtop))
fileout.write(str(next_n_lines))
fileout.close()
break
if not next_n_lines:
break
I do have the file printing as well to show what is in the variable next_n_lines.
*['\n', "randomtext' more junk here\n"]
I would like it instead to look like
randomtext' more junk here
Is this a limitatoin of the islice function? Or am I missing a portion of the syntax?
Thanks for your time!

Where you call str() or print, you want to ''.join(next_n_lines) instead:
print ''.join(next_n_lines)
and
fileout.write(''.join(next_n_lines))
You can store the flattened string in a variable if you don't want to call join twice.

Did you mean something like this?
f = open(userfile,"r")
start = 4
n_lines = 100
for line in f.readlines()[start:(start + n_lines)]:
print line
#do stuff with line
or maybe this rough, yet effective code:
f = open(userfile,"r")
start = 4
end = start + 100
count = start
while count != end:
for line in f.readlines()[count:(count + 2)]:
fileout = open(str(count)+ ".txt", "w+")
fileout.write(str(line))
fileout.close()
count = count + 2

Pick parts from a txt file and copy to another file with python

I'm in trouble here. I need to read a file. Txt file that contains a sequence of records, check the records that I want to copy them to a new file.
The file content is like this (this is just an example, the original file has more than 30 000 lines):
AAAAA|12|120 #begin file
00000|46|150 #begin register
03000|TO|460
99999|35|436 #end register
00000|46|316 #begin register
03000|SP|467
99999|33|130 #end register
00000|46|778 #begin register
03000|TO|478
99999|33|457 #end register
ZZZZZ|15|111 #end file
The records that begin with 03000 and have the characters 'TO' must be written to a new file. Based on the example, the file should look like this:
AAAAA|12|120 #begin file
00000|46|150 #begin register
03000|TO|460
99999|35|436 #end register
00000|46|778 #begin register
03000|TO|478
99999|33|457 #end register
ZZZZZ|15|111 #end file
Code:
file = open("file.txt",'r')
newFile = open("newFile.txt","w")
content = file.read()
file.close()
# here I need to check if the record exists 03000 characters 'TO', if it exists, copy the recordset 00000-99999 for the new file.
I did multiple searches and found nothing to help me.
Thank you!

with open("file.txt",'r') as inFile, open("newFile.txt","w") as outFile:
outFile.writelines(line for line in inFile
if line.startswith("03000") and "TO" in line)
If you need the previous and the next line, then you have to iterate inFile in triads. First define:
def gen_triad(lines, prev=None):
after = current = next(lines)
for after in lines:
yield prev, current, after
prev, current = current, after
And then do like before:
outFile.writelines(''.join(triad) for triad in gen_triad(inFile)
if triad[1].startswith("03000") and "TO" in triad[1])

import re
pat = ('^00000\|\d+\|\d+.*\n'
'^03000\|TO\|\d+.*\n'
'^99999\|\d+\|\d+.*\n'
'|'
'^AAAAA\|\d+\|\d+.*\n'
'|'
'^ZZZZZ\|\d+\|\d+.*')
rag = re.compile(pat,re.MULTILINE)
with open('fifi.txt','r') as f,\
open('newfifi.txt','w') as g:
g.write(''.join(rag.findall(f.read())))
For files with additional lines between lines beginning with 00000, 03000 and 99999, I didn't find simpler code than this one:
import re
pat = ('(^00000\|\d+\|\d+.*\n'
'(?:.*\n)+?'
'^99999\|\d+\|\d+.*\n)'
'|'
'(^AAAAA\|\d+\|\d+.*\n'
'|'
'^ZZZZZ\|\d+\|\d+.*)')
rag = re.compile(pat,re.MULTILINE)
pit = ('^00000\|.+?^03000\|TO\|\d+.+?^99999\|')
rig = re.compile(pit,re.DOTALL|re.MULTILINE)
def yi(text):
for g1,g2 in rag.findall(text):
if g2:
yield g2
elif rig.match(g1):
yield g1
with open('fifi.txt','r') as f,\
open('newfifi.txt','w') as g:
g.write(''.join(yi(f.read())))

file = open("file.txt",'r')
newFile = open("newFile.txt","w")
content = file.readlines()
file.close()
newFile.writelines(filter(lambda x:x.startswith("03000") and "TO" in x,content))

This seems to work. The other answers seem to only be writing out records that contain '03000|TO|' but you have to write out the record before and after that as well.
import sys
# ---------------------------------------------------------------
# ---------------------------------------------------------------
# import file
file_name = sys.argv[1]
file_path = 'C:\\DATA_SAVE\\pick_parts\\' + file_name
file = open(file_path,"r")
# ---------------------------------------------------------------
# create output files
output_file_path = 'C:\\DATA_SAVE\\pick_parts\\' + file_name + '.out'
output_file = open(output_file_path,"w")
# create output files
# ---------------------------------------------------------------
# process file
temp = ''
temp_out = ''
good_write = False
bad_write = False
for line in file:
if line[:5] == 'AAAAA':
temp_out += line
elif line[:5] == 'ZZZZZ':
temp_out += line
elif good_write:
temp += line
temp_out += temp
temp = ''
good_write = False
elif bad_write:
bad_write = False
temp = ''
elif line[:5] == '03000':
if line[6:8] != 'TO':
temp = ''
bad_write = True
else:
good_write = True
temp += line
temp_out += temp
temp = ''
else:
temp += line
output_file.write(temp_out)
output_file.close()
file.close()
Output:
AAAAA|12|120 #begin file
00000|46|150 #begin register
03000|TO|460
99999|35|436 #end register
00000|46|778 #begin register
03000|TO|478
99999|33|457 #end register
ZZZZZ|15|111 #end file

Does it have to be python? These shell commands would do the same thing in a pinch.
head -1 inputfile.txt > outputfile.txt
grep -C 1 "03000|TO" inputfile.txt >> outputfile.txt
tail -1 inputfile.txt >> outputfile.txt

# Whenever I have to parse text files I prefer to use regular expressions
# You can also customize the matching criteria if you want to
import re
what_is_being_searched = re.compile("^03000.*TO")
# don't use "file" as a variable name since it is (was?) a builtin
# function
with open("file.txt", "r") as source_file, open("newFile.txt", "w") as destination_file:
for this_line in source_file:
if what_is_being_searched.match(this_line):
destination_file.write(this_line)
and for those who prefer a more compact representation:
import re
with open("file.txt", "r") as source_file, open("newFile.txt", "w") as destination_file:
destination_file.writelines(this_line for this_line in source_file
if re.match("^03000.*TO", this_line))

code:
fileName = '1'
fil = open(fileName,'r')
import string
##step 1: parse the file.
parsedFile = []
for i in fil:
##tuple1 = (1,2,3)
firstPipe = i.find('|')
secondPipe = i.find('|',firstPipe+1)
tuple1 = (i[:firstPipe],\
i[firstPipe+1:secondPipe],\
i[secondPipe+1:i.find('\n')])
parsedFile.append(tuple1)
fil.close()
##search criterias:
searchFirst = '03000'
searchString = 'TO' ##can be changed if and when required
##step 2: used the parsed contents to write the new file
filout = open('newFile','w')
stringToWrite = parsedFile[0][0] + '|' + parsedFile[0][1] + '|' + parsedFile[0][2] + '\n'
filout.write(stringToWrite) ##to write the first entry
for i in range(1,len(parsedFile)):
if parsedFile[i][1] == searchString and parsedFile[i][0] == searchFirst:
for j in range(-1,2,1):
stringToWrite = parsedFile[i+j][0] + '|' + parsedFile[i+j][1] + '|' + parsedFile[i+j][2] + '\n'
filout.write(stringToWrite)
stringToWrite = parsedFile[-1][0] + '|' + parsedFile[-1][1] + '|' + parsedFile[-1][2] + '\n'
filout.write(stringToWrite) ##to write the first entry
filout.close()
I know that this solution may be a bit long. But it is quite easy to understand. And it seems an intuitive way to do it. And I have already checked this with the Data that you have provided and it works perfectly.
Please tell me if you need some more explanation on the code. I will definitely add the same.

I tip (Beasley and Joran elyase) very interesting, but it only allows to get the contents of the line 03000. I would like to get the contents of the lines 00000 to line 99999.
I even managed to do here, but I am not satisfied, I wanted to make a more cleaner.
See how I did:
file = open(url,'r')
newFile = open("newFile.txt",'w')
lines = file.readlines()
file.close()
i = 0
lineTemp = []
for line in lines:
lineTemp.append(line)
if line[0:5] == '03000':
state = line[21:23]
if line[0:5] == '99999':
if state == 'TO':
newFile.writelines(lineTemp)
else:
linhaTemp = []
i = i+1
newFile.close()
Suggestions...
Thanks to all!

Python - file management and processing multiple zip files

I have some zip files in a folder. I have a script to process them. The data that is to be written to a database is in a different file and its structure is as follows:
some_text;database;file_name
some_text2;database2;file_name2
....
What is the best way to process this file? Also, an error message should be reported if there is no matching zip file name in that file.
My current code:
filelist = glob.glob(os.path.join(rootdir, '*.zip'))
if filelist:
for file in filelist:
print "Working on file ", file
#get only file name without .zip for compare
aa = file.split(sl)
bb = aa[len(aa) -1]
cc = bb.split(".")
ime_sole = cc[0]
fle = codecs.open(rootdir + sl + 'portal_schools.txt',
'r',encoding="cp1250")
line = fle.readline()
# Read lines
for line in iter(fle):
#print line,
a,b,c = line.split(";")
if c == ime_sole:
print c
database = str(b)
#distdir = str(c)
else:
print "some text"
return
fle.close()
But this fails because it is being read line by line. If in the first line there is no match, the code stops. I need it to continue trough the file and then, after all is done, start with a new zip file.

I know my code is far from perfect. The problem was with else. I moved it to the end of the whole code. It was a novice mistake. I also inserted try-catch so if it fails on one zip file, the next one is still processed. Now, it looks something like this:
filelist = glob.glob(os.path.join(rootdir, '*.zip'))
if filelist:
for file in filelist:
try:
aa = file.split(sl)
#print "aa ",aa
bb = aa[len(aa) -1]
#print "bb ", bb
cc = bb.split(".")
#print "cc ", cc
ime_sole = cc[0]
#print "imesole ", ime_sole
fle = codecs.open(rootdir + sl + 'portal_schools.txt','r',encoding="cp1250")
#line = fle.readline()
data = []
for line in iter(fle):
line = line.replace("\r\n", "")
x = line.split(";")
data.append(x)
result = [element for element in data if element[2] == ime_sole]
fle.close()
#print result
if result:
database = result[0][1]
vnos_data = "Podatki za %s , se vpisujejo v bazo %s " % (ime_sole, database)
host ="####"
user="####"
password = "####"
iUrnik_tables = iUrnik_tables_fromzip.Tables(defdir,file,sl,host,database,user,password)
id_skripte =iUrnik_tables[0]
date_begin = iUrnik_tables[1]
date_end = iUrnik_tables[2]
iUrnik_all_fromzip.FileWork(defdir,file,sl,host,database,user,password)
iUrnik_itt_zip.Proces(defdir,file,sl,host,database,user,password,id_skripte,date_begin,date_end)
trenutek = datetime.datetime.now()
trenutek = trenutek.strftime("%Y%m%d%H%M")
newfilename = os.path.splitext(file)[0]
newfilename = newfilename +"_" + str(trenutek) + os.path.splitext(file)[1]
folder = defdir + sl + ime_sole + sl + "archive"
destination = folder + sl
novoimezipa= destination + newfilename.split(sl)[-1]
if not os.path.exists(folder):
os.makedirs(folder)
os.chdir(folder)
shutil.copy(file,destination)
old = destination + file.split(sl)[-1]
os.rename(old , novoimezipa )
os.remove(file)
else:
nothing :)
#return
except:
print sys.exc_info()
else:
vnos_nodata= u"V mapi %s ni podatkov za prenos" % (rootdir)
Logging(defdir, sl, vnos_nodata)
I know it is not perfect but it works :)

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

python: read file and split the data - python

Related

Python coding for opening and saving data to a file

Extracting data from a text file to an output file

Copy 'N' lines from one file to another in python?

Pick parts from a txt file and copy to another file with python

Python - file management and processing multiple zip files

Categories

Resources