Check if MD5 value exists in an index file - python

I am trying to figure out how to check whether the MD5 hash of a URL string already exists in an index file and, if it does, skip the scan.
Below is my code
The URL that is formed is converted to an MD5 string and stored in an .idx file once the scan completes; the goal is that future scans should not pick up the same URL. The issue I see is that the if str(md5url) in line check never matches, probably because I am not appending '\n' to the hash when writing it to the file. But I tried that and it is still not working.
Any ideas?
def computeMD5hash(string_for_hash):
    m = hashlib.md5()
    m.update(string_for_hash.encode('utf-8'))
    return m.hexdigest()

def writefilehash(formation_URL):
    fn = "urlindex.idx"
    try:
        afile = open(fn, 'a')
        afile.write(computeMD5hash(formation_URL))
        afile.close()
    except IOError:
        print("Error writing to the index file")

fn = "urlindex.idx"
try:
    afile = open(fn, 'r')
except IOError:
    afile = open(fn, 'w')

for f in files:
    formation = repouri + "/" + f
    #print(computeMD5hash(formation))
    md5url = computeMD5hash(formation)
    hashlist = afile.readlines()
    for line in hashlist:
        if str(md5url) in line:
            print("Skipping " + formation + " because its already scanned and indexed as " + line)
        else:
            if downloadengine(formation):
                print("Download completed " + formation)
                print("Starting to write to database..")
                #writetodatabase()
                print("Writing hash value ..")
                writefilehash(formation)
print("Closing..")
afile.close()

You are testing in a loop. For every line that doesn't match, you download:
line1
if hash in line:
    print something
else:
    download
line2
if hash in line:
    print something
else:
    download
line3
if hash in line:
    print something
else:
    download
If the hash is in line 1, you still download, because the hash is not in line 2 or line 3. You should not decide to download until you have tested all the lines.
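For example, a minimal way to restructure the question's inner loop with a flag, reusing the names from the code above, so the decision is only made after every indexed line has been checked:

already_indexed = False
for line in hashlist:
    if md5url in line:
        already_indexed = True
        break

if already_indexed:
    print("Skipping " + formation + " because its already scanned and indexed")
elif downloadengine(formation):
    writefilehash(formation)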
The best way to do this is to read all the hashes in one go, into a set object (because testing for containment against a set is faster). Remove the line separators:
try:
    with open(fn) as hashfile:
        hashes = {line.strip() for line in hashfile}
except IOError:
    # no file yet, just use an empty set
    hashes = set()
Then, when testing new hashes, use:
urlhash = computeMD5hash(formation)
if urlhash not in hashes:
    # not seen before, download
    # record the hash
    hashes.add(urlhash)
    with open(fn, 'a') as hashfile:
        hashfile.write(urlhash + '\n')
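For completeness, here is a rough sketch of how the set could be wired back into the scan loop from the question (files, repouri, downloadengine, computeMD5hash and fn all come from the code above; this is an illustration, not the original answer's code):

for f in files:
    formation = repouri + "/" + f
    urlhash = computeMD5hash(formation)
    if urlhash in hashes:
        print("Skipping " + formation + " because its already scanned and indexed")
        continue
    if downloadengine(formation):
        # remember the hash both in memory and on disk
        hashes.add(urlhash)
        with open(fn, 'a') as hashfile:
            hashfile.write(urlhash + '\n')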

Related

How do I get rid of a random new line in the middle of a line in a text file using Python?

The following code takes the contents of 'out.txt' and appends it to the end of 'fixed_inv.txt', producing a new file, 'concat.txt', matched on a shared path.
In the 'concat.txt' file, I am getting a few rows (out of thousands) that seem to have a random new line in the middle of said line.
For instance, a line is supposed to look like:
122 abc.def.com Failed to get CIFS shares with error code -2147024891. None Non-supported share access type. 0 Unkonwn NULL bluearc Different Security Type (1), Access is denied. (1354), Pruned. Different security type (21), The inherited access control list (ACL) or access control entry (ACE) could not be built. (3713), Could not convert the name of inner file or directory (27)
But instead, I have a few looking like:
122 abc.def.com Failed to get CIFS shares with error code -2147024891. None
Non-supported share access type. 0 Unkonwn NULL bluearc Different Security Type (1), Access is denied. (1354), Pruned. Different security type (21), The inherited access control list (ACL) or access control entry (ACE) could not be built. (3713), Could not convert the name of inner file or directory (27)
I have tried to fix this in my code below, but for some reason the code runs without fixing the issue - which is to join the misplaced half line back onto its line, i.e. to get rid of the random newline.
class Error:
    def __init__(self, path, message): #self = new instance of class
        self.path = path
        self.message = message #error message
        self.matched = False #has the path from out.txt been matched to the path of fixed_inv.txt?

def open_files(file1, file2, file3):
    try:
        f1 = open(file1, 'r')
    except IOError:
        print("Can't open {}".format(file1))
        return None, None, None #you can't just open one file you have to open all
    else:
        try:
            f2 = open(file2, 'r')
        except IOError:
            print("Can't open {}".format(file2))
            f1.close()
            return None, None, None
        else:
            try:
                f3 = open(file3, 'w')
            except IOError:
                print("Can't open {}".format(file3))
                f1.close()
                f2.close()
                return None, None, None
            else:
                return f1, f2, f3

def concat(file1, file2, file3):
    errors = {} #key: path, value: instance of class Error
    f1, f2, f3 = open_files(file1, file2, file3)
    prevLine = "" #NEW
    if f1 is not None: #if file one is able to open...
        with f1:
            for line_num, line in enumerate(f1): #get the line number and line
                line = line.replace("\\", "/") #account for the differences in backslashes
                tokens = line.strip().split(': ') #strip white spaces, split based on ':'
                if len(tokens) != 3: #if there's less than two tokens...
                    print('Error on line {} in file {}: Expected three tokens, but found {}'.format(line_num + 1, file1, len(tokens))) #error
                else: #NEW
                    if line.startswith('Non-supported'): #NEW
                        Prevline = line
                        Prevline = line.strip('\n') #NEW
                    else:
                        errors[tokens[1]] = Error(tokens[1], tokens[2])
        with f2:
            with f3:
                for line_num, line in enumerate(f2):
                    line = line.replace("\\", "/").strip() #account for the differences in backslashes
                    tokens_2 = line.strip().split('\t') #strip white spaces, split based on tab
                    if len(tokens_2) < 4: #if we are unable to obtain the path by now since the path should be on 3rd or 4th index
                        print('Error on line {} in file {}: Expected >= 4 tokens, but found {}'.format(line_num + 1, file2, len(tokens_2)))
                        f3.write('{}\n'.format(line))
                    else: #if we have enough tokens to find the path...
                        if tokens_2[3] in errors: #if path is found in our errors dictionary from out.txt...
                            line.strip('\n')
                            path = tokens_2[3] #set path to path found
                            msg = errors[path].message #set the class instance of the value to msg
                            errors[path].matched = True #paths have been matched
                            f3.write('{}\t{}\n'.format(line, msg)) #write the line and the error message to concat
                        else: #if path is NOT found in our errors dictionary from out.txt...
                            f3.write('{}\t{}\n'.format(line, 'None'))
                            print('Error on line {} in file {}: Path {} not matched'.format(line_num + 1, file2, tokens_2[3])) #found in fixed_inv.txt,
                            #but not out.txt
                """for e in errors: #go through errors
                    if errors[e].matched is False: #if no paths have been matched
                        print('Path {} from {} not matched in {}'.format(errors[e].path, file1, file2)) #found in out.txt, but not in fixed_inv
                        f3.write('{}\t{}\n'.format(line, 'No error present'))"""

def main():
    file1 = 'out.txt'
    file2 = 'fixed_inv.txt'
    file3 = 'test_concat.txt'
    concat(file1, file2, file3)

if __name__ == '__main__':
    main()
Any ideas/advice would be greatly appreciated! Thank you.
Try replacing the newline characters before writing.
Ex:
f3.write('{}\n'.format(line.strip().replace("\n", "")))
f3.write('{}\t{}\n'.format(line.strip().replace("\n", ""), msg.replace("\n", "")))
f3.write('{}\t{}\n'.format(line.strip().replace("\n", ""), 'None'))
If you can fix this on the output side, it will obviously be a lot easier and more robust. But if you can’t, what you’re doing is a start in the right direction. You just want to:
Use prevline + line in place of line the first time.
Set prevline = "" in successful cases.
Do the check for an incomplete line before reading an error instead of after.
Distinguish too few tokens (may be an incomplete line) from too many (definitely an error) instead of trying to treat them the same.
Possibly (depending on actual input) replace new lines with some other white space instead of nothing.
Also, you may want to wrap this logic up in a generator function that you can reuse. Something like this:
def tokenizing(lines):
    prevline = ""
    for line in lines:
        line = prevline + line
        line = line.strip_logic_goes_here()
        tokens = tokenize_logic_goes_here(line)
        if len(tokens) > REQUIRED_TOKENS:
            raise AppropriateException()
        elif len(tokens) == REQUIRED_TOKENS:
            yield line, tokens
            prevline = ""
        else:
            prevline = line
    if not prevline: return
    tokens = tokenize_logic_goes_here()
    if len(tokens) != REQUIRED_TOKENS:
        raise AppropriateException()
    yield line, tokens
Then you can just write:
for line, tokens in tokenizing(f1):
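As a purely illustrative example, the placeholders above could be filled in for the out.txt format from this question (three ': '-separated fields per logical line); the default of three tokens, the helper behaviour and the space used to glue the two halves together are assumptions, not part of the original answer:

def tokenizing(lines, required_tokens=3):
    prevline = ""
    for line in lines:
        # glue any carried-over half line onto the current one
        line = (prevline + " " + line).strip() if prevline else line.strip()
        line = line.replace("\\", "/")
        tokens = line.split(': ')
        if len(tokens) > required_tokens:
            raise ValueError("too many tokens: {!r}".format(line))
        elif len(tokens) == required_tokens:
            yield line, tokens
            prevline = ""
        else:
            prevline = line  # incomplete line, wait for its continuation
    if prevline:
        raise ValueError("file ended with an incomplete line: {!r}".format(prevline))

which would make the first loop in concat() reduce to something like:

with open(file1) as f1:
    for line, tokens in tokenizing(f1):
        errors[tokens[1]] = Error(tokens[1], tokens[2])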

replace line if found or append - python

I have text consisting of key-value pairs separated by '='. I would like to replace the line if the key matches; if not, I would like to append it at the bottom. I've tried several ways, including:
def split_command_key_and_value(command):
    if '=' in command:
        command2 = command.split('=')
        return command2

def test(command, path):
    command2 = split_command_key_and_value(command)
    pattern = command2[0]
    myfile = open(path, 'r') # open file handle for read
    # use r'', you don't need to replace '\' with '/'
    result = open(path, 'w') # open file handle for write
    for line in myfile:
        line = line.strip() # it's always good practice to strip what you read from files
        if pattern in line:
            line = command # if match, replace line
        result.write(line) # write every line
    myfile.close() # don't forget to close file handle
    result.close()
I know the above is just to replace text, but it deletes the text in the file, and I can't see why. Could someone point me in the right direction?
Thanks
Update:
I'm almost there, but some of my lines have similar keys, so multiple lines are matching when only one should. I've tried to incorporate a regex boundary in my loop with no luck. My code is below. Does anyone have a suggestion?
There is some text in the file that isn't key-value, so I would like to skip that.
def modify(self, name, value):
    comb = name + ' ' + '=' + ' ' + value + '\n'
    with open('/file/', 'w') as tmpstream:
        with open('/file/', 'r') as stream:
            for line in stream:
                if setting_name in line:
                    tmpstream.write(comb)
                else:
                    tmpstream.write(line)
I think I got it. See code below.
def modify(self, name, value):
    comb = name + ' ' + '=' + ' ' + value + '\n'
    mylist = []
    with open('/file/', 'w') as tmpstream:
        with open('/file/', 'r') as stream:
            for line in stream:
                a = line.split()
                b = re.compile('\\b' + name + '\\b')
                if len(a) > 0:
                    if b.search(a[0]):
                        tmpstream.write(comb)
                    else:
                        tmpstream.write(line)
I spoke too soon. It stops at the key-value I provide. So, it only writes one line, and doesn't write the lines that don't match.
def modify(name, value):
    comb = name + ' ' + '=' + ' ' + value + '\n'
    mylist = []
    with open('/file1', 'w') as tmpstream:
        with open('/file2', 'r') as stream:
            for line in stream:
                a = line.split()
                b = re.compile('\\b' + name + '\\b')
                if len(a) > 0:
                    if b.search(a[0]):
                        tmpstream.write(comb)
                    else:
                        tmpstream.write(line)
Can anyone see the issue?
Because when you open a file for writing
result = open(path, 'w') # open file handle for write
you just erase its content. Try writing to a different file and, after all the work is done, replace the old file with the new one. Or read all the data into memory, then process it and write it back to the file.
with open(path) as f:
    data = f.read()
with open(path, 'w') as f:
    for l in data.splitlines():
        # do the work here
First of all, you are reading and writing the same file...
You could first read it all and then write line by line:
with open(path, 'r') as f:
    myfile = f.read() # read everything in the variable "myfile"
result = open(path, 'w') # open file handle for write
for line in myfile.splitlines(): # process the original file content 1 line at a time
    # as before
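A rough sketch of how that read-then-rewrite idea could be completed for the replace-or-append case (the helper name and the exact output formatting are assumptions, not the poster's code):

def replace_or_append(path, command):
    pattern = command.split('=')[0]
    with open(path, 'r') as f:
        lines = f.read().splitlines()
    replaced = False
    with open(path, 'w') as f:
        for line in lines:
            if pattern in line:
                f.write(command + '\n') # replace the matching line
                replaced = True
            else:
                f.write(line + '\n')    # keep everything else as-is
        if not replaced:
            f.write(command + '\n')     # key never seen, append at the bottom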
I strongly recommend reading python's documentation on how to read and write files.
If you open an existing file in write-mode open(path, 'w'), its content will be erased:
mode can be (...) 'w' for only writing (an existing file with the same name will be erased)
To replace a line in python you can have a look at this: Search and replace a line in a file in Python
Here is one of the solutions provided there, adapted to your context (tested with Python 3):
from tempfile import mkstemp
from shutil import move
from os import close

def test(filepath, command):
    # Split command into key/value
    key, _ = command.split('=')
    matched_key = False
    # Create a temporary file
    fh, tmp_absolute_path = mkstemp()
    with open(tmp_absolute_path, 'w') as tmp_stream:
        with open(filepath, 'r') as stream:
            for line in stream:
                if key in line:
                    matched_key = True
                    tmp_stream.write(command + '\n')
                else:
                    tmp_stream.write(line)
        if not matched_key:
            tmp_stream.write(command + '\n')
    close(fh)
    move(tmp_absolute_path, filepath)
Note that with the code above every line that matches key (key=blob or blob=key) will be replaced.
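If that is a problem (as in the update above, where keys sharing a prefix collide), one possible variation - an illustration, not part of the original answer - is to compare the text before '=' against the key exactly instead of using a substring test:

def replace_or_append_exact(filepath, name, value):
    comb = name + ' = ' + value + '\n'
    matched = False
    with open(filepath, 'r') as stream:
        lines = stream.readlines()
    with open(filepath, 'w') as stream:
        for line in lines:
            # only treat the line as a match when the key before '=' is exactly `name`
            if '=' in line and line.split('=', 1)[0].strip() == name:
                stream.write(comb)
                matched = True
            else:
                stream.write(line)
        if not matched:
            stream.write(comb)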

Python -- How to split headers/chapters into separate files automatically

I'm converting text directly to epub and I'm having a problem automatically splitting the HTML book file into separate header/chapter files. At the moment, the code below partially works but only creates every other chapter file. So half the header/chapter files are missing from the output. Here is the code:
def splitHeaderstoFiles(fpath):
    infp = open(fpath, 'rt', encoding=('utf-8'))
    for line in infp:
        # format and split headers to files
        if '<h1' in line:
            #-----------format header file names and other stuff ------------#
            # create a new file for the header/chapter section
            path = os.getcwd() + os.sep + header
            with open(path, 'wt', encoding=('utf-8')) as outfp:
                # write html top meta headers
                outfp = addMetaHeaders(outfp)
                # add the header
                outfp = outfp.write(line)
                # add the chapter/header bodytext
                for line in infp:
                    if '<h1' not in line:
                        outfp.write(line)
                    else:
                        outfp.write('</body>\n</html>')
                        break
        else:
            continue
    infp.close()
The problem occurs in the second for loop at the bottom of the code, when I look for the next h1 tag to stop the split. I cannot use seek() or tell() to rewind or move back one line so that the program can find the next header/chapter on the next iteration. Apparently you cannot use these in Python inside a for loop that is iterating over the file; it just gives a 'can't do non-zero cur-relative seeks' error.
I've also tried the while line != ' ' + readline() combination in the code which also gives the same error as above.
Does anyone know an easy way to split HTML headers/chapters of varying lengths into separate files in Python? Are there any special Python modules (such as pickle) that could help make this task easier?
I'm using Python 3.4
My grateful thanks in advance for any solutions to this problem...
I ran into a similar problem a while ago; here is a simplified solution:
from itertools import count

chapter_number = count(1)

output_file = open('000-intro.html', 'wt')
with open('index.html', 'rt') as input_file:
    for line in input_file:
        if '<h1' in line:
            output_file.close()
            output_file = open('{:03}-chapter.html'.format(next(chapter_number)), 'wt')
        output_file.write(line)
output_file.close()
In this approach, the block of text leading up to the first h1 is written into 000-intro.html; the first chapter will be written into 001-chapter.html, and so on. Please modify it to taste.
The solution is a simple one: Upon encountering the h1 tag, close the last output file and open a new one.
You are looping over your input file twice, which is likely causing your problems:
for line in infp:
    ...
    with open(path, 'wt', encoding=('utf-8')) as outfp:
        ...
        for line in infp:
            ...
Both loops share the same file iterator, so the inner loop consumes lines (including the next h1) that the outer loop then never sees again.
You might try transforming your for loop into a while so you're not using two different iterators:
while infp:
    line = infp.readline()
    if '<h1' in line:
        with open(...) as outfp:
            while infp:
                line = infp.readline()
                if '<h1' in line:
                    break
                outfp.writeline(...)
Alternatively, you may wish to use an HTML parser (e.g., BeautifulSoup). Then you can do something like what is described here: https://stackoverflow.com/a/8735688/65295.
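For instance, a rough sketch of the parser route (assuming bs4 is installed and the chapters sit directly under <body>, delimited by <h1> tags; this is not the linked answer verbatim):

from bs4 import BeautifulSoup

with open('index.html', 'rt', encoding='utf-8') as f:
    soup = BeautifulSoup(f.read(), 'html.parser')

chapters = []
current = []
for element in soup.body.children:
    if element.name == 'h1':       # a new chapter starts here
        if current:
            chapters.append(current)
        current = [element]
    else:
        current.append(element)
if current:
    chapters.append(current)

for number, parts in enumerate(chapters, 1):
    with open('{:03}-chapter.html'.format(number), 'wt', encoding='utf-8') as outfp:
        outfp.write(''.join(str(part) for part in parts))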
Update from comment - essentially, read the entire file all at once so you can freely move back or forward as necessary. This probably won't be a performance issue unless you have a really really big file (or very little memory).
lines = infp.readlines() # read the entire file
i = 0
while i < len(lines):
    if '<h1' in lines[i]:
        with open(...) as outfp:
            j = i + 1
            while j < len(lines):
                if '<h1' in lines[j]:
                    break
                outfp.write(lines[j])
                j += 1
        # line j has an <h1>, set i to j so we detect it at the
        # top of the next loop iteration.
        i = j
    else:
        i += 1
I eventually found the answer to the above problem. The code below does a lot more than just get the file header. It also simultaneously loads two parallel lists with formatted file name data (with extension) and pure header name data respectively, so I can use these lists to fill in the title and formatted filename extension in these html files within a while loop in one hit. The code now works well and is shown below.
def splitHeaderstoFiles(dir, inpath):
    count = 1
    t_count = 0
    out_path = ''
    header = ''
    write_bodytext = False
    file_path_names = []
    pure_header_names = []
    inpath = dir + os.sep + inpath
    with open(inpath, 'rt', encoding=('utf-8')) as infp:
        for line in infp:
            if '<h1' in line:
                #strip html tags, convert to start caps
                p = re.compile(r'<.*?>')
                header = p.sub('', line)
                header = capwords(header)
                line_save = header
                # Add 0 for count below 10
                if count < 10:
                    header = '0' + str(count) + '_' + header
                else:
                    header = str(count) + '_' + header
                # remove all spaces + add extension in header
                header = header.replace(' ', '_')
                header = header + '.xhtml'
                count = count + 1
                #create two parallel lists used later
                out_path = dir + os.sep + header
                outfp = open(out_path, 'wt', encoding=('utf-8'))
                file_path_names.insert(t_count, out_path)
                pure_header_names.insert(t_count, line_save)
                t_count = t_count + 1
                # Add html meta headers and write it
                outfp = addMainHeaders(outfp)
                outfp.write(line)
                write_bodytext = True
            # add header bodytext
            elif write_bodytext == True:
                outfp.write(line)
    # now add html titles and close the html tails on all files
    max_num_files = len(file_path_names)
    tmp = dir + os.sep + 'temp1.tmp'
    i = 0
    while i < max_num_files:
        outfp = open(tmp, 'wt', encoding=('utf-8'))
        infp = open(file_path_names[i], 'rt', encoding=('utf-8'))
        for line in infp:
            if '<title>' in line:
                line = line.strip(' ')
                line = line.replace('<title></title>', '<title>' + pure_header_names[i] + '</title>')
                outfp.write(line)
            else:
                outfp.write(line)
        # add the html tail
        if '</body>' in line or '</html>' in line:
            pass
        else:
            outfp.write(' </body>' + '\n</html>')
        # clean up
        infp.close()
        outfp.close()
        shutil.copy2(tmp, file_path_names[i])
        os.remove(tmp)
        i = i + 1
    # now rename just the title page
    if os.path.isfile(file_path_names[0]):
        title_page_name = file_path_names[0]
        new_title_page_name = dir + os.sep + '01_Title.xhtml'
        os.rename(title_page_name, new_title_page_name)
        file_path_names[0] = '01_Title.xhtml'
    else:
        logmsg27(DEBUG_FLAG)
        os._exit(0)
    # xhtml file is no longer needed
    if os.path.isfile(inpath):
        os.remove(inpath)
    # returned list values are also used
    # later to create epub opf and ncx files
    return(file_path_names, pure_header_names)
@Hai Vu and @Seth -- thanks for all your help.

Python Writing to txt error

I'm trying to write different things to a text file in a while loop, but it only writes once. I want to write something to unmigrated.txt.
import urllib.request
import json

Txtfile = input("Name of the TXT file: ")
fw = open(Txtfile + ".txt", "r")
red = fw.read()
blue = red.split("\n")
i = 0
while i < len(blue):
    try:
        url = "https://api.mojang.com/users/profiles/minecraft/" + blue[i]
        rawdata = urllib.request.urlopen(url)
        newrawdata = rawdata.read()
        jsondata = json.loads(newrawdata.decode('utf-8'))
        results = jsondata['id']
        url_uuid = "https://sessionserver.mojang.com/session/minecraft/profile/" + results
        rawdata_uuid = urllib.request.urlopen(url_uuid)
        newrawdata_uuid = rawdata_uuid.read()
        jsondata_uuid = json.loads(newrawdata_uuid.decode('utf-8'))
        try:
            results = jsondata_uuid['legacy']
            print(blue[i] + " is " + "Unmigrated")
            wf = open("unmigrated.txt", "w")
            wring = wf.write(blue[i] + " is " + "Unmigrated\n")
        except:
            print(blue[i] + " is " + "Migrated")
    except:
        print(blue[i] + " is " + "Not-Premium")
    i += 1
You keep re-opening the file with 'w' inside the loop, so you only see the last data that was written to it. Either open the file once outside the loop, or open it with 'a' to append. Opening it once would be the simplest approach; you can also use range instead of your while, or better again, just iterate over the list:
with open("unmigrated.txt", "w") as f: # with close your file automatically
for ele in blue:
.....
Also, wring = wf.write(blue[i] + " is " + "Unmigrated\n") sets wring to None, which is what write returns, so it is probably not of any real use.
Lastly, using a bare except is usually never a good idea; catch the specific exceptions you expect, and log or at least print when you get an error.
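As a small illustration of catching specific exceptions with urllib (a sketch reusing the Mojang endpoint from the question; the helper name is made up, and this is not code from the original answer):

import json
import urllib.error
import urllib.request

def fetch_profile(name):
    url = "https://api.mojang.com/users/profiles/minecraft/" + name
    try:
        with urllib.request.urlopen(url) as response:
            return json.loads(response.read().decode('utf-8'))
    except urllib.error.HTTPError as e:        # server answered with an error status
        print("{}: HTTP error {}".format(name, e.code))
    except urllib.error.URLError as e:         # network / DNS problem
        print("{}: connection problem: {}".format(name, e.reason))
    except ValueError as e:                    # body was not valid JSON
        print("{}: bad JSON: {}".format(name, e))
    return None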
Using the requests library, I would break up your code doing something like:
import requests

def get_json(url):
    try:
        rawdata = requests.get(url)
        return rawdata.json()
    except requests.exceptions.RequestException as e:
        print(e)
    except ValueError as e:
        print(e)
    return {}

txt_file = input("Name of the TXT file: ")

with open(txt_file + ".txt") as fw, open("unmigrated.txt", "w") as f: # with closes your files automatically
    for line in map(str.rstrip, fw): # remove newlines
        url = "https://api.mojang.com/users/profiles/minecraft/{}".format(line)
        results = get_json(url).get("id")
        if not results:
            continue
        url_uuid = "https://sessionserver.mojang.com/session/minecraft/profile/{}".format(results)
        results = get_json(url_uuid).get('legacy')
        print("{} is Unmigrated".format(line))
        f.write("{} is Unmigrated\n".format(line))
I am not sure where 'legacy' fits into the code; that logic I will leave to you. You can also iterate directly over the file object, so you can forget about splitting the lines into blue.
Try:

with open("filename", "w") as f:
    f.write("your content")
But that will overwrite all contents of the file.
Instead, if you want to append to the file use:
with open("filename", "a") as f:
If you choose to not use the with syntax, remember to close the file.
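For example (purely illustrative; "filename" is just a placeholder name):

f = open("filename", "a")
try:
    f.write("more content\n")
finally:
    f.close() # always close the handle when not using with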
Read more here:
https://docs.python.org/2/library/functions.html#open

Search, count and add - Python

properties = ["color", "font-size", "font-family", "width", "height"]
inPath = "style.css"
outPath = "output.txt"
#Open a file for reading
file = open(inPath, 'rU')
if file:
# read from the file
filecontents = file.read()
file.close()
else:
print "Error Opening File."
#Open a file for writing
file = open(outPath, 'wb')
if file:
for i in properties:
search = i
index = filecontents.find(search)
file.write(str(index), "\n")
file.close()
else:
print "Error Opening File."
It seems to work, but:
It only searches for each keyword once?
It's not writing to the output file; I get "function takes exactly 1 argument".
I don't actually want it to print the index, but the number of times the keyword appears.
Many thanks
First, you want .count(search), not .find(search), if what you're looking for is # of occurrences.
Second, .write() only takes a single parameter - if you want to write a newline, you need to concatenate it first, or call .write() twice.
Third, doing for i in properties: search = i is redundant; just use the name you want in your for loop.
for search in properties:
    cnt = filecontents.count(search)
    file.write(str(cnt) + "\n")
from itertools import imap

properties = ("color", "font-size", "font-family", "width", "height")

inPath = "style.css"
outPath = "output.txt"

try:
    #Open a file for reading
    filecontents = file(inPath).read()
except Exception as exc:
    print exc
else:
    #Open a file for writing
    with open(outPath, 'wb') as out_file:
        #for property in properties:
        #    out_string = "%s %s\n"
        #    out_file.write( out_string % (
        #        property, filecontents.count(property)))
        out_file.write('\n'.join(
            imap(str, imap(filecontents.count, properties))))
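Note that this answer is Python 2 (print statement, the file() builtin, itertools.imap). A rough Python 3 equivalent, sketched here as an assumption since the built-in map is already lazy there, might look like:

properties = ("color", "font-size", "font-family", "width", "height")

inPath = "style.css"
outPath = "output.txt"

try:
    with open(inPath) as in_file:
        filecontents = in_file.read()
except OSError as exc:
    print(exc)
else:
    with open(outPath, 'w') as out_file:
        # one count per property, one per line
        out_file.write('\n'.join(
            str(filecontents.count(prop)) for prop in properties))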
