Python - file management and processing multiple zip files

I have some zip files in a folder. I have a script to process them. The data that is to be written to a database is in a different file and its structure is as follows:
some_text;database;file_name
some_text2;database2;file_name2
....
What is the best way to process this file? Also, an error message should be reported if there is no matching zip file name in that file.
My current code:
filelist = glob.glob(os.path.join(rootdir, '*.zip'))
if filelist:
    for file in filelist:
        print "Working on file ", file
        # get only the file name without .zip for the comparison
        aa = file.split(sl)
        bb = aa[len(aa) - 1]
        cc = bb.split(".")
        ime_sole = cc[0]
        fle = codecs.open(rootdir + sl + 'portal_schools.txt', 'r', encoding="cp1250")
        line = fle.readline()
        # Read lines
        for line in iter(fle):
            #print line,
            a, b, c = line.split(";")
            if c == ime_sole:
                print c
                database = str(b)
                #distdir = str(c)
            else:
                print "some text"
                return
        fle.close()
But this fails because the file is read line by line: if the first line does not match, the code stops. I need it to continue through the file and, only after the whole file has been checked, move on to the next zip file.

I know my code is far from perfect. The problem was with the else: I moved it to the end of the whole code. It was a novice mistake. I also added a try/except so that if one zip file fails, the next one is still processed. Now it looks something like this:
filelist = glob.glob(os.path.join(rootdir, '*.zip'))
if filelist:
    for file in filelist:
        try:
            aa = file.split(sl)
            #print "aa ", aa
            bb = aa[len(aa) - 1]
            #print "bb ", bb
            cc = bb.split(".")
            #print "cc ", cc
            ime_sole = cc[0]
            #print "ime_sole ", ime_sole
            fle = codecs.open(rootdir + sl + 'portal_schools.txt', 'r', encoding="cp1250")
            #line = fle.readline()
            data = []
            for line in iter(fle):
                line = line.replace("\r\n", "")
                x = line.split(";")
                data.append(x)
            result = [element for element in data if element[2] == ime_sole]
            fle.close()
            #print result
            if result:
                database = result[0][1]
                # "Data for <school> is being written to database <db>"
                vnos_data = "Podatki za %s , se vpisujejo v bazo %s " % (ime_sole, database)
                host = "####"
                user = "####"
                password = "####"
                iUrnik_tables = iUrnik_tables_fromzip.Tables(defdir, file, sl, host, database, user, password)
                id_skripte = iUrnik_tables[0]
                date_begin = iUrnik_tables[1]
                date_end = iUrnik_tables[2]
                iUrnik_all_fromzip.FileWork(defdir, file, sl, host, database, user, password)
                iUrnik_itt_zip.Proces(defdir, file, sl, host, database, user, password, id_skripte, date_begin, date_end)
                trenutek = datetime.datetime.now()
                trenutek = trenutek.strftime("%Y%m%d%H%M")
                newfilename = os.path.splitext(file)[0]
                newfilename = newfilename + "_" + str(trenutek) + os.path.splitext(file)[1]
                folder = defdir + sl + ime_sole + sl + "archive"
                destination = folder + sl
                novoimezipa = destination + newfilename.split(sl)[-1]
                if not os.path.exists(folder):
                    os.makedirs(folder)
                os.chdir(folder)
                shutil.copy(file, destination)
                old = destination + file.split(sl)[-1]
                os.rename(old, novoimezipa)
                os.remove(file)
            else:
                pass  # no matching entry in portal_schools.txt for this zip file
            #return
        except:
            print sys.exc_info()
else:
    # "There is no data to transfer in folder <rootdir>"
    vnos_nodata = u"V mapi %s ni podatkov za prenos" % (rootdir)
    Logging(defdir, sl, vnos_nodata)
I know it is not perfect but it works :)
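For comparison, here is a rough sketch of a more compact structure for the same lookup. It is not the production code above: it assumes the same rootdir, sl and portal_schools.txt layout, lets os.path do the file-name splitting, and find_database / zip_base are just illustrative names. It reads the whole mapping file and only reports a missing entry after every line has been checked:

import os
import glob
import codecs

def find_database(schools_file, zip_base):
    """Return the database name listed for zip_base, or None if there is no entry."""
    fle = codecs.open(schools_file, 'r', encoding="cp1250")
    try:
        for line in fle:
            parts = line.strip().split(";")
            if len(parts) >= 3 and parts[2] == zip_base:
                return parts[1]
    finally:
        fle.close()
    return None

for zippath in glob.glob(os.path.join(rootdir, '*.zip')):
    zip_base = os.path.splitext(os.path.basename(zippath))[0]
    database = find_database(rootdir + sl + 'portal_schools.txt', zip_base)
    if database is None:
        print "No entry in portal_schools.txt for", zip_base
        continue
    # ... archive and import zippath into database as in the code above ...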

Python coding for opening and saving data to a file

I am having an issue getting the train function to work correctly in Python. I cannot modify the function definition. I am at the point where I need the second file, PosList, to be read one line at a time, and I need to match the value of movieWordCount[z] against the lines in OpenPos. If the word is there, I need to increment column 2 of that line (the columns are separated by a space) by one. If it is not, I need the else branch to append it to the end of the file. It does not work: it does not append the values when they are missing, and I am not sure it will find the value when it is there. I have been stuck getting this to work for two days.
Here is my code segment I am working with:
with open("PosList") as OpenPos:
lines = OpenPos.readlines()
print lines
if movieWordCount[z] in lines:
print "found"
#Now use tokenize to split it apart by space and set to new array for me to call column2
else:
print "not found"
lines.append(movieWordCount[z] + " 1" + "\n")
Here is my full code:
#!/usr/bin/python
#Import Counter
import collections
from collections import Counter
#Was already here but pickle is used for data input and export
import math, os, pickle, re

class Bayes_Classifier:
    def __init__(self, trainDirectory = "movie_reviews/"):
        #If file listing exists skip to train
        if os.path.isfile('iFileList'):
            print "file found"
            self.train()
            #self.classify()
        #If file listing does not exist skip to train
        if not os.path.isfile('iFileList'):
            print "no file"
            newfile = 'iFileList'
            tempList = set()
            subDir = './movie_reviews'
            for filenames in os.listdir(subDir):
                my_sub_path = os.path.join(os.sep, subDir, filenames)
                tempList.add(filenames)
            self.save("filenames", "try3")
            f = []
            for fFileObj in os.walk("movie_reviews/"):
                f.extend(fFileObj)
                break
            pickle.dump(f, open("save.p", "wb"))
            self.save(f, "try4")
            with open(newfile, 'wb') as fi:
                pickle.dump(tempList, fi)
            #print tempList
            self.train()
            #self.classify()

    def train(self):
        '''Trains the Naive Bayes Sentiment Classifier.'''
        print "File ready for training"
        #Open iFileList to use as input for opening movie files
        x = 0
        OpenIFileList = open('iFileList', 'r')
        print "iFileList now Open"
        #Loop through the file
        for line in OpenIFileList:
            #print "Ready to read lines"
            #print "reading line " + line
            if x > 4:
                if x % 2 == 0:
                    #print line
                    s = line
                    if '-' in s:
                        comp = s.split("'")
                        #print comp[2]
                        print comp[1]  #This is what you need for the movie file
                        compValue1 = comp[1]
                        #Determine Positive/Negative.
                        #compType is the variable I am storing it to.
                        compType = compValue1.split("-", 2)[1]
                        #print compType #Prints that middle value like 5 or 1
                        # This will do the work based on the value.
                        if compType == '5':
                            # print "you have a five" #Confirms the loop I am in.
                            #If the file does not exist, create it
                            if not os.path.exists('PosList'):
                                print "no file"
                                file('PosList', 'w').close()
                            #Open file that needs to be reviewed for word count
                            compValue2 = "movie_reviews/" + compValue1
                            print compValue2  #Prints the directory and file path
                            OpenMovieList = open(compValue2, 'r')
                            for commentLine in OpenMovieList:
                                commentPositive = commentLine.split(" ")
                                commentPositiveCounter = Counter(commentPositive)
                                #print commentPositiveCounter # " Comment Pos goes here"
                                #if commentLine != '' or commentLine != ' ':
                                #Get first word, second word, ....
                                if commentLine and (not commentLine.isspace()):
                                    movieWordCount = self.tokenize(commentLine)
                                    y = len(movieWordCount)  #determines length of string
                                    print y
                                    z = 0
                                    #print movieWordCount[0] # Shows the zero position in the file.
                                    while z < y:
                                        print "position " + str(z) + " word is " + movieWordCount[z]  # Shows the word we are at and position id
                                        with open("PosList") as OpenPos:
                                            lines = OpenPos.readlines()
                                            print lines
                                            if movieWordCount[z] in lines:
                                                print "found"
                                            else:
                                                print "not found"
                                                lines.append(movieWordCount)
                                        z = z + 1
                            #Close the files
                            OpenMovieList.close()
                            OpenPos.close()
            x += 1
        #for line2 in OpenIFileList.readlines():
        #for line in open('myfile','r').readlines():
        #    do_something(line)
        #Save results
        #Close the File List
        OpenIFileList.close()

    def loadFile(self, sFilename):
        '''Given a file name, return the contents of the file as a string.'''
        f = open(sFilename, "r")
        sTxt = f.read()
        f.close()
        return sTxt

    def save(self, dObj, sFilename):
        '''Given an object and a file name, write the object to the file using pickle.'''
        f = open(sFilename, "w")
        p = pickle.Pickler(f)
        p.dump(dObj)
        f.close()

    def load(self, sFilename):
        '''Given a file name, load and return the object stored in the file.'''
        f = open(sFilename, "r")
        u = pickle.Unpickler(f)
        dObj = u.load()
        f.close()
        return dObj

    def tokenize(self, sText):
        '''Given a string of text sText, returns a list of the individual tokens that
        occur in that string (in order).'''
        lTokens = []
        sToken = ""
        for c in sText:
            if re.match("[a-zA-Z0-9]", str(c)) != None or c == "\'" or c == "_" or c == '-':
                sToken += c
            else:
                if sToken != "":
                    lTokens.append(sToken)
                    sToken = ""
                if c.strip() != "":
                    lTokens.append(str(c.strip()))
        if sToken != "":
            lTokens.append(sToken)
        return lTokens
To open a file for writing, you can use
with open('PosList', 'w') as Open_Pos:
As you are using the with form, you do not need to close the file; Python will do that for you at the end of the with-block.
So assuming that the way you add data to the lines variable is correct, you could remove the superfluous OpenPos.close() call and append two lines to your code:
with open("PosList") as OpenPos:
lines = OpenPos.readlines()
print lines
if movieWordCount[z] in lines:
print "found"
else:
print "not found"
lines.append(movieWordCount)
with open("PosList", "w") as OpenPos:
OpenPos.write(lines)
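If the goal really is the increment-or-append behaviour described in the question, a minimal sketch of that piece might look like the function below. It is only a sketch: update_pos_count is an illustrative name, and it assumes each PosList line has the form "word count". Re-reading and rewriting the file for every word is slow, but it keeps the shape of the original loop:

def update_pos_count(word, pos_list_path="PosList"):
    """Increment the count stored next to word, or append 'word 1' if it is missing."""
    try:
        with open(pos_list_path) as f:
            lines = f.readlines()
    except IOError:
        lines = []  # the file does not exist yet
    found = False
    for i, line in enumerate(lines):
        parts = line.split()
        if parts and parts[0] == word:
            count = int(parts[1]) if len(parts) > 1 else 0
            lines[i] = "%s %d\n" % (word, count + 1)
            found = True
            break
    if not found:
        lines.append("%s 1\n" % word)
    with open(pos_list_path, "w") as f:
        f.writelines(lines)

Inside the while loop you would then call update_pos_count(movieWordCount[z]) instead of the read/compare block.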

python: read file and split the data

I have a file named config, and its fields are separated by a space (" "):
cat config
/home/user1 *.log,*.txt 30
/home/user2 *.trm,*.doc,*.jpeg 10
I want to read this file, parse each line, and print each field from each line.
Ex:-
Dir = /home/user1
Fileext = *.log,*.txt
days=30
I couldn't get further than the code below:
def dir():
    file = open('config', 'r+')
    cont = file.readlines()
    print "file contents are %s" % cont
    for i in range(len(cont)):
        j = cont[i].split(' ')
dir()
Any pointers on how to move further?
Your code is fine; you are just missing the last step of processing each element of the split string. Try this:
def dir():
    file = open('config', 'r+')
    cont = file.readlines()
    print "file contents are %s" % cont + '\n'
    elements = []
    for i in range(len(cont)):
        rowElems = cont[i].strip().split(' ')  # strip() drops the trailing newline
        elements.append({'dir': rowElems[0], 'ext': rowElems[1], 'days': rowElems[2]})
    for e in elements:
        print "Dir = " + e['dir']
        print "Fileext = " + e['ext']
        print "days = " + e['days']
dir()
At the end of this code, you will have all the rows processed and stored in a list of dictionaries that you can easily access later.
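As a variation on the same idea (a sketch only, not the poster's code; parse_config is an illustrative name), you could return the list instead of printing inside the function, and use split() with no argument so the trailing newline never ends up in the days field:

def parse_config(path='config'):
    entries = []
    with open(path) as f:
        for line in f:
            parts = line.split()  # splits on any whitespace and drops the newline
            if len(parts) >= 3:
                entries.append({'dir': parts[0], 'ext': parts[1], 'days': parts[2]})
    return entries

for e in parse_config():
    print "Dir = %s, Fileext = %s, days = %s" % (e['dir'], e['ext'], e['days'])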
You can write a custom function to parse each line, and then use the map function to apply that function against each line in file.readlines():
def parseLine(line):
    # function to split and parse each line,
    # and return the formatted string
    Dir, FileExt, Days = line.split(' ')[:3]
    return 'Dir = {}\nFileext = {}\nDays = {}'.format(Dir, FileExt, Days)

def dir():
    with open('config', 'r+') as file:
        print 'file contents are\n' + '\n'.join(map(parseLine, file.readlines()))
Results:
>>> dir()
file contents are
Dir = /home/user1
Fileext = *.log,*.txt
Days = 30
Dir = /home/user2
Fileext = *.trm,*.doc,*.jpeg
Days = 10

Python tarfile gzipped file bigger than sum of source files

I have a Python routine which archives call recordings into a gzipped tarball. The output file appears to be far larger than the source files, and I cannot work out why. As an example of the scale of the issue, 6GB of call recordings generate an archive of 10GB.
There appear to be no errors in the script and the output .gz file is readable and appears OK apart from the huge size.
Excerpt from my script as follows:
# construct tar filename and open file
client_fileid = client_id + "_" + dt.datetime.now().strftime("%Y%m%d_%H%M%S")
tarname = tar_path + "/" + client_fileid + ".tar.gz"
print "Opening tar file %s " % (tarname), "\n"
try:
    tar = tarfile.open(tarname, "w:gz")
except:
    print "Error opening tar file: %s" % sys.exc_info()[0]

sql = """SELECT number, er.id, e.id, flow, filename, filesize, unread, er.cr_date, callerid,
         length, callid, info, party FROM extension_recording er, extension e, client c
         WHERE er.extension_id = e.id AND e.client_id = c.id AND c.parent_client_id = %s
         AND DATE(er.cr_date) BETWEEN '%s' AND '%s'""" % (client_id, start_date, end_date)
rows = cur.execute(sql)
recordings = cur.fetchall()
if rows == 0: sys.exit("No recordings for selected date range - exiting")

for recording in recordings:  # loop through recordings cursor
    try:
        ext_len = len(str(recording[0]))
        # add preceding zeroes if the ext no starts with 0 or 00
        if ext_len == 2: extension_no = "0" + str(recording[0])
        elif ext_len == 1: extension_no = "00" + str(recording[0])
        else: extension_no = str(recording[0])
        filename = recording[4]
        extended_no = client_id + "*%s" % (extension_no)
        sourcedir = recording_path + "/" + extended_no
        tardir = extended_no + "/" + filename
        complete_name = sourcedir + "/" + filename
        tar.add(complete_name, arcname=tardir)  # add to tar archive
    except:
        print "Error '%s' writing to tar file %s" % (sys.exc_info()[1], csvfullfilename)

separate line output by groups

My Python script checks mysqldump files, and if there are any problems the script prints:
Dump is old for db;
Dump is not complete for db;
Dump is empty for db;
MySQL dump does not exist for db;
The script logs these records to a file line by line.
My question: is there a way to group the output in the file like this:
Dump is old for db;
Dump is old for db;
Dump is old for db;
Dump is not complete for db;
Dump is not complete for db;
Dump is not complete for db;
Dump is empty for db;
Dump is empty for db;
Dump is empty for db;
Because now my file looks like:
Dump is old for db;
Dump is empty for db;
Dump is old for db;
MySQL dump does not exist for db;
...
etc
Here is my small script :)
#!/bin/env python
import psycopg2
import sys, os
from subprocess import Popen, PIPE
from datetime import datetime
import smtplib

con = None
today = datetime.now().strftime("%Y-%m-%d")
log_dump_fail = '/tmp/mysqldump_FAIL'
log_fail = open(log_dump_fail, 'w').close()
log_fail = open(log_dump_fail, 'a')
sender = 'PUT_SENDER_NAME_HERE'
receiver = ['receiver_name']
smtp_daemon_host = 'localhost'

def db_backup_file_does_not_exist(db_backup_file):
    if not os.path.exists(db_backup_file): return True
    else: return False

def dump_health(last_dump_row, file_name, db):
    last_row = last_dump_row.rsplit(" ")
    tms = ''.join(last_row[4:5])
    status = last_row[1:3]
    if (status) and (tms != today):
        log_fail.write("\nDB is old for " + str(db) + str(file_name) + ", \nDump finished at " + str(''.join(tms)))
        log_fail.write("\n-------------------------------------------")
    elif not (status) and (tms == None):
        log_fail.write("\nDump is not complete for " + str(db) + str(file_name) + " , end of file is not correct")
        log_fail.write("\n-------------------------------------------")

suffixes = ['B', 'KB', 'MB', 'GB', 'TB', 'PB']

def humansize(nbytes):
    if nbytes == 0: return '0 B'
    i = 0
    while nbytes >= 1024 and i < len(suffixes) - 1:
        nbytes /= 1024.
        i += 1
    f = ('%.2f' % nbytes).rstrip('0').rstrip('.')
    return '%s %s' % (f, suffixes[i])

def dump_size(dump_file, file_name, db):
    size = os.path.getsize(dump_file)
    if (size < 1024):
        human_readable = humansize(size)
        log_fail.write("\nDump is empty for " + str(db) + "\n" + "\t" + str(file_name) + ", file size is " + str(human_readable))
        log_fail.write("\n-------------------------------------------")

def report_to_noc(subject, text):
    TEXT = text
    SUBJECT = subject
    message = 'Subject: %s\n\n%s' % (SUBJECT, TEXT)
    server = smtplib.SMTP(smtp_daemon_host)
    server.sendmail(sender, receiver, message)
    server.quit()

try:
    con = psycopg2.connect(database='**', user='***', password='***', host='****')
    cur = con.cursor()
    cur.execute("""\
        select ad.servicename, (select name from servers where id = ps.server_id) as servername
        from packages as p, account_data as ad, package_servers as ps
        where p.id=ad.package_id and
              p.date_deleted IS NULL and
              p.id=ps.package_id and
              p.aktuel IS NULL and
              p.pre_def_package_id = 4 and
              p.mother_package_id !=0 and
              ps.subservice_id=5 and
              p.mother_package_id NOT IN (select id from packages where date_deleted IS NOT NULL)
        ORDER BY servername;
        """)
    while (1):
        row = cur.fetchone()
        if row == None:
            break
        db = row[0]
        server_name = str(row[1])
        if (''.join(server_name) == 'SKIP_THIS') or (''.join(server_name) == 'SKIP_THIS'):
            continue
        else:
            db_backup_file = '/storage/backup/db/mysql/' + str(db) + '/current/' + str(db) + '.mysql.gz'
            db_backup_file2 = '/storage/backup/' + str(''.join(server_name.split("DB"))) + '/mysql/' + str(db) + '/current/' + str(db) + '.mysql.gz'
            db_file_does_not_exist = False
            db_file2_does_not_exist = False
            if db_backup_file_does_not_exist(db_backup_file):
                db_file_does_not_exist = True
            if db_backup_file_does_not_exist(db_backup_file2):
                db_file2_does_not_exist = True
            if db_file_does_not_exist and db_file2_does_not_exist:
                log_fail.write("\nMySQL dump does not exist for " + str(db) + "\n" + "\t" + str(db_backup_file2) + "\n" + "\t" + str(db_backup_file))
                log_fail.write("\n-------------------------------------------")
                continue
            elif (db_file_does_not_exist) and not (db_file2_does_not_exist):
                p_zcat = Popen(["zcat", db_backup_file2], stdout=PIPE)
                p_tail = Popen(["tail", "-2"], stdin=p_zcat.stdout, stdout=PIPE)
                dump_status = str(p_tail.communicate()[0])
                dump_health(dump_status, db_backup_file2, db)
                dump_size(db_backup_file2, db_backup_file2, db)
            elif (db_file2_does_not_exist) and not (db_file_does_not_exist):
                p_zcat = Popen(["zcat", db_backup_file], stdout=PIPE)
                p_tail = Popen(["tail", "-2"], stdin=p_zcat.stdout, stdout=PIPE)
                dump_status = str(p_tail.communicate()[0])
                dump_health(dump_status, db_backup_file, db)
                dump_size(db_backup_file, db_backup_file, db)
    con.close()
except psycopg2.DatabaseError, e:
    print 'Error %s' % e
    sys.exit(1)

log_fail.close()

if os.path.getsize(log_dump_fail) > 0:
    subject = "Not all MySQL dumps completed successfully. Log file backup:" + str(log_dump_fail)
    fh = open(log_dump_fail, 'r')
    text = fh.read()
    fh.close()
    report_to_noc(subject, text)
else:
    subject = "MySQL dump completed successfully for all DBs, listed in PC"
    text = "Hello! \nI am notifying you that I checked mysqldump files this morning.\nThere is nothing to worry about. :)"
    report_to_noc(subject, text)
You can process your log file after it has been written.
One option is to read your file and sort the lines:
lines = open('log.txt').readlines()
lines.sort()
open('log_sorted.txt', 'w').write("".join(lines))  # lines already end with "\n"
This won't emit an empty line between log types.
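If you do want a blank line between the different message types, one possibility (a sketch, assuming every log line has the form "... for db;") is to sort by the message prefix and group with itertools.groupby:

from itertools import groupby

def message_type(line):
    # everything before " for " identifies the message type ("Dump is old", ...)
    return line.split(" for ")[0]

lines = sorted(open('log.txt').readlines(), key=message_type)
with open('log_sorted.txt', 'w') as out_file:
    for _, group in groupby(lines, key=message_type):
        out_file.writelines(group)
        out_file.write("\n")  # blank line between message types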
Another option is to use a Counter:
from collections import Counter
lines = open('log.txt').readlines()
counter = Counter()
for line in lines:
    counter[line] += 1
out_file = open('log_sorted.txt', 'w')
for line, num in counter.iteritems():
    out_file.write(line * num + "\n")
Looks like you want to group the output of the script, rather than log the info as it comes while searching.
Easiest would be to maintain four lists, one each for "old", "empty", "not complete" and "does not exist". In the script, add the db names to the appropriate list instead of logging, and then dump the lists one by one into the file with the appropriate prefixes ("not empty for " + dbname).
For example, remove all the log_fail.write() calls from the functions, replace them with appends to the appropriate list, and write a separate function that writes to the log file the way you want:
Add lists:
db_dump_is_old_list = []
db_dump_is_empty_list = []
db_dump_is_not_complete_list = []
db_dump_does_not_exist_list = []
Modify the Functions:
def dump_health(last_dump_row, file_name, db):
    last_row = last_dump_row.rsplit(" ")
    tms = ''.join(last_row[4:5])
    status = last_row[1:3]
    if (status) and (tms != today):
        db_dump_is_old_list.append(str(db))
        #log_fail.write("\nDB is old for "+ str(db) + str(file_name) + ", \nDump finished at " + str(''.join(tms)))
        #log_fail.write("\n-------------------------------------------")
    elif not (status) and (tms == None):
        db_dump_is_not_complete_list.append(str(db))
        #log_fail.write("\nDump is not complete for "+str(db) + str(file_name) + " , end of file is not correct")
        #log_fail.write("\n-------------------------------------------")

def dump_size(dump_file, file_name, db):
    size = os.path.getsize(dump_file)
    if (size < 1024):
        human_readable = humansize(size)
        db_dump_is_empty_list.append(str(db))
        #log_fail.write("\nDump is empty for " +str(db) + "\n" +"\t" + str(file_name)+", file size is " + str(human_readable))
        #log_fail.write("\n-------------------------------------------")

if db_file_does_not_exist and db_file2_does_not_exist:
    db_dump_does_not_exist_list.append(str(db))
    #log_fail.write("\nMySQL dump does not exist for " + str(db) + "\n" + "\t" + str(db_backup_file2) + "\n" + "\t" + str(db_backup_file))
    #log_fail.write("\n-------------------------------------------")
    continue
And add a logger function:
def dump_info_to_log_file():
    log_dump_fail = '/tmp/mysqldump_FAIL'
    log_fail = open(log_dump_fail, 'w').close()
    log_fail = open(log_dump_fail, 'a')
    for dbname in db_dump_is_old_list:
        log_fail.write("Dump is Old for " + str(dbname) + "\n")
    log_fail.write("\n\n")
    for dbname in db_dump_is_empty_list:
        log_fail.write("Dump is Empty for " + str(dbname) + "\n")
    log_fail.write("\n\n")
    for dbname in db_dump_is_not_complete_list:
        log_fail.write("Dump is Not Complete for " + str(dbname) + "\n")
    log_fail.write("\n\n")
    for dbname in db_dump_does_not_exist_list:
        log_fail.write("Dump Does Not Exist for " + str(dbname) + "\n")
    log_fail.close()
Or you could simply log as you are doing, and then read in the file, sort and write back the file.
Thank you all for all the interesting ideas.
I have really tried all the options :)
To my mind:
With the Counter object the pro is fewer lines of code, but the con is many read/write operations. The log file is not big; however, I decided to reduce the number of reads and writes.
With the lists the con is more lines of code :) but the pro is that the file is written only once.
So I implemented the lists.. :)
Thank you guys!!!

regarding re-factoring python code

So I have this Python file which looks for all the "label" tags in an XML file and does some modification to them. A label is a string spanning at most three lines. The code manipulates the XML file.
#1 label="Number of Packets Transmitted by the Source
Node of the Path to the Destination Node Of
the Path"
#2 label="Number of Packets Transmitted by the Source
node of the path to the destination node of
the path"
Notice that in label #2 the words in the second and third lines are not in upper case, which is not what I want. I want help correcting the logic of my program so that the label is not written twice.
import os
from io import StringIO, BytesIO

def splitAndMakeTitleCase(line):
    # does something not relevant to context
    pass

fileList = open("AllFiles")
for fileStr in fileList:
    fileName = fileStr.rstrip('\n')
    openFile = open(fileName)
    openNewFile = open(fileName + 'TitleCase.xml', 'w')
    lines = openFile.readlines()
    for lineIndex in range(0, len(lines)):
        line = lines[lineIndex]
        skip = 0
        if "label=" in line and "const" not in line:
            segs = line.split('"')
            if len(segs) >= 3:
                pass
            else:
                openNewFile.write(lines[lineIndex])
                secondTitleCaseLine = splitAndMakeTitleCase(lines[lineIndex + 1])
                skip = lineIndex + 1
                openNewFile.write(secondTitleCaseLine)
                if '"' not in lines[lineIndex + 1]:
                    thirdTitleCaseLine = splitAndMakeTitleCase(lines[lineIndex + 2])
                    skip = lineIndex + 1
                    openNewFile.write(thirdTitleCaseLine)
        openNewFile.write(lines[lineIndex])
    openFile.close()
    openNewFile.close()
    #cmd = "mv " + fileName + "TitleCase.xml " + fileName
    #os.system(cmd)
In your for loop you have the first if, and within that you do some writing to the file. Then after that you do another write of the line to the file. I think you probably want that last write in an else, like this:
for fileStr in fileList:
    fileName = fileStr.rstrip('\n')
    openFile = open(fileName)
    openNewFile = open(fileName + 'TitleCase.xml', 'w')
    lines = openFile.readlines()
    for lineIndex in range(0, len(lines)):
        line = lines[lineIndex]
        skip = 0
        if "label=" in line and "const" not in line:
            segs = line.split('"')
            if len(segs) >= 3:
                pass
            else:
                openNewFile.write(lines[lineIndex])
                secondTitleCaseLine = splitAndMakeTitleCase(lines[lineIndex + 1])
                skip = lineIndex + 1
                openNewFile.write(secondTitleCaseLine)
                if '"' not in lines[lineIndex + 1]:
                    thirdTitleCaseLine = splitAndMakeTitleCase(lines[lineIndex + 2])
                    skip = lineIndex + 1
                    openNewFile.write(thirdTitleCaseLine)
        else:
            openNewFile.write(lines[lineIndex])
    openFile.close()
    openNewFile.close()
    #cmd = "mv " + fileName + "TitleCase.xml " + fileName
    #os.system(cmd)
