Python: loop doing the same thing to one item n times as opposed to doing it once to n items

Here is the example:
I am trying to grab a series of XML pages, and then extract data from them.
It downloads each individual page, as the while loop was designed to do, but tester() prints the data from the first downloaded file V times, even though the file is downloaded again and cleared on every pass through the loop.
This is killing me. What am I doing wrong?
def tester():
    with open('raw.txt') as myFile:
        test = linecache.getline('raw.txt', 12)
        print test
        test = ""
        myFile.close

def grab_data(Year, rcvote):
    link = "XXX/%s/roll%s.xml" % (Year, rc)
    site = urllib2.urlopen(link)
    localFile = open('raw.txt', 'w')
    localFile.write(site.read(100000))
    localFile.close()
    tester()

while (V != 0):
    rc = str(V)
    if (len(rc) == 2):
        rc = "0%s" % (rc)
    elif (len(rc) == 1):
        rc = "00%s" % (rc)
    else:
        rc = rc
    grab_data(Year, rc)
    V = V - 1

The problem is the linecache module: it caches file contents by filename and assumes a file with the same name has not changed, so every getline call keeps returning lines from the first download.
But why write the data to a file just to read it again anyway?
def tester(text):
    line12 = text.splitlines()[11]
    print line12

def grab_data(year, rcvote):
    link = "XXX/%s/roll%03d.xml" % (year, rcvote)
    site = urllib2.urlopen(link)
    tester(site.read(100000))

while v:
    grab_data(year, v)
    v -= 1
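If the data really does need to round-trip through raw.txt, another option is to drop linecache's cached copy before each read. A minimal sketch (Python 2, to match the code above), keeping the rest of the question's code unchanged:

import linecache

def tester():
    # linecache caches file contents keyed by filename; clear the stale
    # cache entry so the freshly downloaded raw.txt is actually re-read.
    linecache.clearcache()
    test = linecache.getline('raw.txt', 12)
    print test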

Quick and dirty duplicate finder based on size and last write time only [duplicate]

Is there simple and fast Python code to identify duplicate files in a directory tree based on file size and last write time only? (A couple of false positives are OK. Forget hashing; it is too slow and not needed for an initial pass over potential real dups.)
S/O abounds with similar questions, but they tend to use md5 or byte-by-byte comparison.
Any suggestions? Or do I need to run the code below and compare it to find duplicate lines in the first two columns? (And maybe run a hash only on the ones with matching last write time and size?)
def get_size(filename):
    st = os.stat(filename)
    return str(st.st_size)

def get_last_write_time(filename):
    st = os.stat(filename)
    convert_time_to_human_readable = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(st.st_mtime))
    return convert_time_to_human_readable
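For reference, a minimal sketch of the size-plus-last-write-time grouping the question asks for, with no hashing at all (the target folder is a placeholder; groups holding more than one path are only candidate duplicates, so false positives are possible):

import os
from collections import defaultdict

def find_candidate_dups(root):
    # Group every file under root by (size in bytes, modification time);
    # any group with more than one path is a candidate set of duplicates.
    groups = defaultdict(list)
    for dirpath, _, filenames in os.walk(root):
        for name in filenames:
            path = os.path.join(dirpath, name)
            try:
                st = os.stat(path)
            except OSError:
                continue  # unreadable or vanished file, skip it
            groups[(st.st_size, st.st_mtime)].append(path)
    return {key: paths for key, paths in groups.items() if len(paths) > 1}

for (size, mtime), paths in find_candidate_dups(".").items():
    print(size, mtime, paths)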
LOL! That's my code! :)))))))
Try this (LAST UPDATE):
import os, hashlib, time

your_target_folder = "."  # change with your target folder.

def size_check(get_path):
    try:
        st = os.stat(get_path)
    except:
        return "Error"
    else:
        return str(st.st_size)

def md5_check(get_path):
    try:
        hash_md5 = hashlib.md5()
        with open(get_path, "rb") as f:
            for chunk in iter(lambda: f.read(2 ** 20), b""):
                hash_md5.update(chunk)
    except:
        return "Error"
    else:
        return hash_md5.hexdigest()

def save_data(get_output):
    with open("./data.txt", 'a') as output_data:
        output_data.write(get_output)

print("Walking All Files In Your Target Directory and Grabbing Their Hashes, Please Wait ... \n")

files_and_sizes = {}
for dirpath, _, filenames in os.walk(your_target_folder):
    for items in filenames:
        file_full_path = os.path.abspath(os.path.join(dirpath, items))
        get_size = size_check(file_full_path)
        if get_size in files_and_sizes:
            files_and_sizes[get_size].append(file_full_path)
        else:
            files_and_sizes[get_size] = [file_full_path]

new_dict = {}
error_box = []

for key, box_name in files_and_sizes.items():
    if not key == "Error" and len(box_name) > 1:
        for files in box_name:
            get_file_hash = md5_check(files)
            if not get_file_hash == "Error":
                if get_file_hash in new_dict:
                    new_dict[get_file_hash].append(files)
                else:
                    new_dict[get_file_hash] = [files]
            else:
                error_box.append(files)
    elif key == "Error" and len(box_name) > 0:
        do = [error_box.append(error_files) for error_files in box_name]
    else:
        pass

for hashes, names in new_dict.items():
    if len(names) > 1:
        for each_files in names:
            result = each_files + "\n"
            print(result)
            save_data(result)
    else:
        pass

if len(error_box) > 0:
    print("Something went wrong on these files (I could not access them): " + str(error_box) + "\n")

print("Good bye.")
Good Luck...

Python and Comparing File Changes

d = feedparser.parse('somerssfeed/rss.xml')

message = {}
smessage = {}

for post in d.entries:
    message[post.link] = post.title

fwrite = open("db.txt", "a")

for k, v in message.items():
    if k in open("db.txt", "r"):
        print("already exists")
    else:
        fwrite.write("\n" + "{0}".format(k) + "\n")
        smessage[k] = v
What I want to achieve is to parse RSS feeds and write their links into a text file. The problem is that the next time the script runs it shouldn't return old RSS items, so I compare against the text file, but that comparison is failing. On the first run it writes all the links; on the second run it should write nothing, because all of the links are the same, yet it writes the same links again.
EDIT:
After a whole day of trial and error, this worked:
for k, v in message.items():
    if k in open('db.txt').read():
        print('already exists')
    else:
        smessage[k] = v
        fwrite = open("db.txt", "a")
        fwrite.write('\n{0}\n'.format(k))
        fwrite.close()
You aren't reading the file's contents before testing membership. Use this:

g = open("db.txt", "r")
lines = [line.strip() for line in g.readlines()]
if k in lines:
    print("already exists")

Make a diff of 2 text file quickly using python

I have two big text files (17 MB right now, but they could reach gigabytes), so I don't want to load them into RAM because their size could exceed my RAM capacity.
The code I have written so far is this:
def stopIfFileExist(filename):
    if os.path.isfile(filename):
        raise Exception("%s already exist" % filename)

def compareDump(before_filename, after_filename, diff_filename):
    """
    Compare 2 dumps generated via makeDump(output_filename) and generate
    a file containing the differences
    -before_filename : (string) filename of the first dump
    -after_filename : (string) filename of the second dump
    -diff_filename : (string) filename of the diff
    """
    stopIfFileExist(diff_filename)

    num_lines = sum(1 for line in open(after_filename))
    one_percent = num_lines / float(100)

    diff = []
    start = time.time()

    with open(after_filename, "r") as afterFile:
        counter = 0
        for a_line in afterFile:
            print "completion : %.9f percents" % (counter / float(one_percent))
            counter = counter + 1
            diff.append(a_line)
            with open(before_filename, "r") as beforeFile:
                for b_line in beforeFile:
                    if a_line.rstrip() == b_line.rstrip():
                        diff.pop()
                        break

    end = time.time()
    print "task completed in %s seconds" % (end - start)

    with open(diff_filename, "a") as diffFile:
        for line in diff:
            diffFile.write(line)
What I'd like to do is remove from beforeFile a line that was successfully compared (i.e. when the if a_line.rstrip() == b_line.rstrip(): branch is triggered).
However, since I am currently reading the file, I don't see how to do it.
Any ideas?
Thanks.
I was able to diff two 20 megabyte files in a little over 3 minutes using the following test code.
Every 10,000 lines I put a random number, which you can see diff'd in the results.
import random
import difflib
import os
import time

start = time.time()

NUM_LINES = int(10000000 / 4)

t1 = 'test1'
t2 = 'test2'

if os.path.exists(t1):
    os.remove(t1)
if os.path.exists(t2):
    os.remove(t2)

with open(t1, 'w+') as f1:
    for number in range(1, NUM_LINES):
        if number % 10000 == 0:
            r = random.randint(1, number)
        else:
            r = 1
        f1.write(str(number * r) + '\n')
    else:
        f1.seek(0)

    with open(t2, 'w+') as f2:
        for number in range(1, NUM_LINES):
            if number % 10000 == 0:
                r = random.randint(1, number)
            else:
                r = 1
            f2.write(str(number * r) + '\n')
        else:
            f2.seek(0)

        t1 = f1.readlines()
        t2 = f2.readlines()

for l in difflib.unified_diff(t1, t2, lineterm=''):
    print(l.strip())

print('Execution took: {:.2f} seconds'.format(time.time() - start))
I pasted the output on github, as it is obscenely long.
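Since the files may eventually outgrow RAM, here is a minimal set-based sketch that holds only the before file's lines in memory and streams the after file, writing out lines of the after file that never appear in the before file (file names are placeholders; ordering and duplicate counts are ignored):

def diff_new_lines(before_filename, after_filename, diff_filename):
    # Keep only the "before" file's lines in memory.
    with open(before_filename, "r") as before_file:
        before_lines = set(line.rstrip("\n") for line in before_file)

    # Stream the "after" file and write out anything not seen before.
    with open(after_filename, "r") as after_file, open(diff_filename, "w") as diff_file:
        for line in after_file:
            if line.rstrip("\n") not in before_lines:
                diff_file.write(line)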

Python I/O multiprocessing with no Return on function

I have a working python script that, in a simplified way, works as follows:
open("A", 'r')
open("B", 'r')
open("C", 'w')
for lineA in A:
part1, part2, part3 = lineA.split(' ')
for lineB in B:
if part2 in lineB:
C.write(lineB)
I want to check whether a section of each line of file A exists in file B, and if so, write that whole line from file B to a new file C.
The process is somewhat time consuming the way I have designed it (1: I still consider myself a noob with Python; 2: there are at least 4 if statements running inside the main for loop), and now that I have started to use input files around 200x larger than before, I am getting times of around 5 hours per input file.
I have tried to use multiprocessing but I can't seem to get it to work.
Initially I tried a simple piece of code inside my main() function, without any significant improvement and definitely without using more than one CPU:
p = Process(target=multi_thread, args=(arg1,arg2,arg3))
p.start()
p.join()
Then I tried the jobs approach:
jobs = []
for i in range(4):
    p = Process(target='myfunc')
    jobs.append(p)
    p.start()
    p.join()
And a pool example I found here in the forums, to which I added a return statement to my function:

def multiproc(arg1, arg2, arg3):
    (...)
    return lineB  # example of return statement

def main():
    pool = Pool(4)
    with open('file.txt', 'w') as map_file:
        # chunk the work into batches of 4 lines at a time
        results = pool.map(multi_thread, map_file, 4)

if __name__ == "__main__":
    main()
The jobs approach actually created the file and then restarted the whole process from scratch three more times. The last one (the pool) gives me the following error:
io.UnsupportedOperation: not readable
And I also suppose that my Return statement is breaking my loop...
Any suggestions to enable multiprocessing for this piece of code, or also to improve its neatness?
Thanks!
EDIT:
As requested, here is the full messy code:
#!/usr/bin/python3
__author__ = 'daniel'

import os
import re
from multiprocessing import Process
from multiprocessing import Pool
import time

start_time = time.time()

def multi_thread(filePath, datasetFolder, mapFileDataset):
    fout = open('outdude.txt', 'w')
    cwd = os.getcwd()
    cwdgen, sep, id = filePath.rpartition('/')
    dataset = datasetFolder.rsplit("/", 1)
    dataset = dataset[1]

    ## Create file
    for i in os.listdir(cwd):
        if ".ped" in i:
            sample_id, sep, rest = i.partition('.ped')

    for i in os.listdir(cwd):
        if sample_id + '.pileupgatk' in i and dataset in i:
            pileup4map = open(i, 'r')
            snpcounter = sum(1 for _ in pileup4map) - 1
            pileup4map.seek(0)
            mapout = open(sample_id + '.map', 'w')
            counter = 1
            for line in pileup4map:
                if counter <= snpcounter:
                    mapFileData = open(datasetFolder + '/' + mapFileDataset, 'r')
                    line = line.rstrip()
                    chro, coord, refb, rbase, qual = line.split(' ')
                    chrom = chro.strip("chr")
                    counter += 1
                    for ligna in mapFileData:
                        if coord in ligna:
                            k = re.compile(r'(?=%s )' % coord, re.I)
                            lookAhead = k.search(ligna)
                            k = re.compile(r'(?<= %s)' % coord, re.I)
                            lookBehind = k.search(ligna)
                            if lookAhead and lookBehind != None:
                                lignaChrom = ligna[:2].rstrip(' ')
                                if chrom == lignaChrom:
                                    lignaOut = ligna.rstrip()
                                    mapout.write(lignaOut + '\n')
                                    ## For POOL
                                    return lignaOut
                                else:
                                    pass
                            else:
                                pass
                        else:
                            pass
            mapout.close()

def main():
    # Multiproc
    # p = Process(target=multi_thread, args=('/home/full_karyo.fa', '/home/haak15', 'dataPP.map'))
    # p.start()
    # p.join()
    # print("--- %s seconds ---" % (time.time() - start_time))

    # Jobs
    # jobs = []
    # for i in range(4):
    #     p = Process(target=multi_thread, args=('/home/full_karyo.fa', '/home/haak15', 'dataPP.map'))
    #     jobs.append(p)
    #     p.start()
    #     p.join()

    # Pool
    pool = Pool(4)
    with open('file.txt', 'w') as map_file:
        # chunk the work into batches of 4 lines at a time
        results = pool.map(multi_thread, map_file, 4)

    print(results)
    print("--- %s seconds ---" % (time.time() - start_time))

if __name__ == "__main__":
    main()
EDIT2:
Following Robert E's and TheBigC's advice I rewrote my code, and it is now 13x faster and not as confusing. I used a dictionary approach that is not as I/O-hungry as the previous one, as TheBigC pointed out. I am happy enough with the speed, so I will leave multiprocessing aside for now. Thanks for the comments!
if makemap == True:
    ## Dictionary method - 13X faster
    for i in os.listdir(cwd):
        if ".ped" in i:
            sample_id, sep, rest = i.partition('.ped')

    for i in os.listdir(cwd):
        if sample_id + '.pileupgatk' in i and dataset in i:
            print("\n\t> Creating MAP file from sample: " + sample_id)
            pileup4map = open(i, 'r')
            snpcounter = sum(1 for _ in pileup4map) - 1
            pileup4map.seek(0)
            counter = 1
            piledic = {}
            for line in pileup4map:
                if counter <= snpcounter:
                    line = line.rstrip()
                    # chr21 43805965 G G G
                    chro, coord, refb, rbase, qual = line.split(' ')
                    chrom = chro.strip("chr")
                    piledic[chrom, coord] = int(counter)
                    counter += 1
            pileup4map.close()

            mapFileData = open(datasetFolder + '/' + mapFileDataset, 'r')
            mapDic = {}
            counterM = 1
            for ligna in mapFileData:
                # 22 Affx-19821577 0.737773 50950707 A G
                chroMap, ident, prob, posMap, bas1, bas2 = ligna.split()
                mapDic[chroMap, posMap] = int(counterM)
                counterM += 1

            listOfmatches = []
            for item in piledic:
                if item in mapDic:
                    listOfmatches.append(mapDic[item])
            listOfmatches.sort()

            mapWrite = open(sample_id + ".map", 'w')
            mapFileData.seek(0)
            lineCounter = 1
            for lignagain in mapFileData:
                if lineCounter in listOfmatches:
                    mapWrite.write(lignagain)
                lineCounter += 1
            mapWrite.close()
            mapFileData.close()
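Aside: the io.UnsupportedOperation: not readable error earlier came from handing pool.map a file opened with 'w'; pool.map needs an iterable of work items, such as a file opened for reading. If multiprocessing is ever revisited, a minimal sketch of that pattern with a hypothetical pure worker function (placeholder names and file paths, not the code above):

from multiprocessing import Pool

def process_line(line):
    # Hypothetical per-line worker: must be a top-level function so it can
    # be pickled and sent to the worker processes.
    return line.strip().upper()

def main():
    with open('input.txt', 'r') as infile, open('output.txt', 'w') as outfile:
        with Pool(4) as pool:
            # Lines are handed to workers in chunks of 4; results come back in order.
            for result in pool.map(process_line, infile, 4):
                outfile.write(result + '\n')

if __name__ == "__main__":
    main()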

Python simple Multi-threaded Downloader file corrupted

This is my first post. I have been doing Python programming for quite some time and recently was working on a multi-threaded downloader. The problem is that my file (a jpg is my target) gets corrupted. With the following input: http://www.aumathletics.com/images_web/headerAUMLogo.jpg
it shows an error,
while with the input:
http://www.nasa.gov/images/content/607800main_kepler1200_1600-1200.jpg
the file gets corrupted.
Here is the code:
import os, sys, requests
import threading
import urllib2
import time

URL = sys.argv[1]

def buildRange(value, numsplits):
    lst = []
    for i in range(numsplits):
        if i == 0:
            lst.append('%s-%s' % (i, int(round(1 + i * value/(numsplits*1.0) + value/(numsplits*1.0)-1, 0))))
        else:
            lst.append('%s-%s' % (int(round(1 + i * value/(numsplits*1.0),0)), int(round(1 + i * value/(numsplits*1.0) + value/(numsplits*1.0)-1, 0))))
    return lst

def main(url=None, splitBy=5):
    start_time = time.time()
    if not url:
        print "Please Enter some url to begin download."
        return

    fileName = "image.jpg"
    sizeInBytes = requests.head(url, headers={'Accept-Encoding': 'identity'}).headers.get('content-length', None)
    print "%s bytes to download." % sizeInBytes
    if not sizeInBytes:
        print "Size cannot be determined."
        return

    dataDict = {}

    # split total num bytes into ranges
    ranges = buildRange(int(sizeInBytes), splitBy)

    def downloadChunk(idx, irange):
        req = urllib2.Request(url)
        req.headers['Range'] = 'bytes={}'.format(irange)
        dataDict[idx] = urllib2.urlopen(req).read()

    # create one downloading thread per chunk
    downloaders = [
        threading.Thread(
            target=downloadChunk,
            args=(idx, irange),
        )
        for idx, irange in enumerate(ranges)
    ]

    # start threads, let run in parallel, wait for all to finish
    for th in downloaders:
        th.start()
    for th in downloaders:
        th.join()

    print 'done: got {} chunks, total {} bytes'.format(
        len(dataDict), sum((
            len(chunk) for chunk in dataDict.values()
        ))
    )
    print "--- %s seconds ---" % str(time.time() - start_time)

    if os.path.exists(fileName):
        os.remove(fileName)

    # reassemble file in correct order
    with open(fileName, 'w') as fh:
        for _idx, chunk in sorted(dataDict.iteritems()):
            fh.write(chunk)

    print "Finished Writing file %s" % fileName
    print 'file size {} bytes'.format(os.path.getsize(fileName))

if __name__ == '__main__':
    main(URL)
The indentation here might be wrong, so here is the code: pastebin(dot)com/wGEkp878
I would be very grateful if someone could point out the error.
EDIT: as suggested by another user:
def buildRange(value, numsplits):
    lst = []
    for i in range(numsplits):
        first = i if i == 0 else buildRange().start(i, value, numsplits)
        second = buildRange().end(i, value, numsplits)
        lst.append("{}-{}".format(first, second))
    return lst
Can anyone tell me how to keep the downloaded part files with names like part1, part2, and so on?
It turns out the file must be opened in binary mode, with 'wb' instead of 'w'. If it is opened with just 'w', a bunch of extra characters will be written, because in text mode Windows translates '\n' bytes into '\r\n'. If you use 'wb' it will write exactly the bytes you put into the file.
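Applied to the reassembly loop from the question, the fix is just the mode string; a minimal sketch:

# reassemble file in correct order, in binary mode so the bytes are written verbatim
with open(fileName, 'wb') as fh:
    for _idx, chunk in sorted(dataDict.iteritems()):
        fh.write(chunk)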
EDIT:
If you want to store the individual file parts you can change
# reassemble file in correct order
with open(fileName, 'w') as fh:
    for _idx, chunk in sorted(dataDict.iteritems()):
        fh.write(chunk)

print "Finished Writing file %s" % fileName
print 'file size {} bytes'.format(os.path.getsize(fileName))

To

# reassemble file in correct order
for _idx, chunk in sorted(dataDict.iteritems()):
    with open(fileName + str(".part-") + str(_idx), 'wb') as fh:
        fh.write(chunk)

print "Finished Writing file %s" % fileName
# print 'file size {} bytes'.format(os.path.getsize(fileName))
