Splitting a big HTML file into multiple smaller files

Splitting a big HTML file into multiple smaller files - python

I extracted a full Discord dm as an HTML file.
It's too big to open up like that, so I wanted to split it into multiple files. I found another post here that had a solution, but I can't seem to figure out what I do wrong. Since the script opens up for 1 second and just closes afterwards with no results.
Here is the code I'm using.
from __future__ import print_function
from lxml import etree, html
from io import StringIO
from pathlib import Path
parser = html.HTMLParser()
header= "<html><body>\n"
footer = "</body></html>\n"
i = 1
fi = 1
messagesPerFile = 3
file = "DMPim.html"
buffer = ""
try:
tree = html.parse(StringIO(Path(file).read_text()), parser)
try:
# target and print all <div class="collectionDiv"> elements and subelements
for element in tree.xpath('//div[#class="chatlog__message-group"]'):
buffer += etree.tostring(element, pretty_print=True).decode("utf-8")
if i % messagesPerFile == 0 and i > 0:
f = open("chat" + str(fi) + ".html", "w+")
f.write(header + buffer + footer)
f.close()
fi+=1
buffer = ""
i+=1
# if remaining elements are still in the buffer, write them out
if buffer != "":
f = open("chat" + str(fi) + ".html", "w+")
f.write(header + buffer + footer)
f.close()
except etree.XPathEvalError as details:
print ('ERROR: XPath expression', details.error_log)
except etree.XMLSyntaxError as details:
print ('ERROR: parser', details.error_log)
I'm pretty new in all this and wanted this as sort of start up project, so excuse me if I'm asking obvious things.

Related

I am trying to create a program that edits text by letting you select from a few things and changes them to one of a few options

import os, re
config_file = "jsm_gyro_config.txt"
#fptr = open(config, "w")
#text = "demo text"
#fptr.write(text)
#fptr.close()
file = open(config_file, 'r')
file-read = file.read()
for line in file-read:
if re.search(userinput, file-read):
x = re.search(userinput, file-read)
# iteminputted is what the user wants to replace
iteminputted = "ref"
startpostion = x.span[1] + 3
endpostion = startposition + len(iteminputted)
# Find out how to write to a specific location in a file that will finish this off
else:
print("Item not found")
This is what i've tried and here is my thought process as always any help is appreatated and please make it understandable for an idiot :(

To begin with, you should not use - in your variable declarations as it is actually an operator and will always be treated as such. It will attempt to subtract.
Here is the same code with that fixed and also with the input
import os, re
config_file = "jsm_gyro_config.txt"
#fptr = open(config, "w")
#text = "demo text"
#fptr.write(text)
#fptr.close()
file = open(config_file, 'r')
file_read = file.read()
file.close() # You should always close your files.
for line in file_read:
if re.search(userinput, file_read):
x = re.search(userinput, file_read)
# iteminputted is what the user wants to replace
iteminputted = input("Input what you would like to replace > ")
startpostion = x.span[1] + 3
endpostion = startposition + len(iteminputted)
# Find out how to write to a specific location in a file that will finish this off
else:
print("Item not found")
However your question is very unclear, I did the best I could.

Python - How to stop the loop

I have this where it reads a file called source1.html, source2.html, source3.html, but when it cant find the next file (because it doesnt exist) it gives me a error. there can be an x amount of sourceX.html, so i need something to say if the next sourcex.html file can not be found, stop the loop.
Traceback (most recent call last): File "main.py", line 14, in
file = open(filename, "r") IOError: [Errno 2] No such file or
directory: 'source4.html
how can i stop the script looking for the next source file?
from bs4 import BeautifulSoup
import re
import os.path
n = 1
filename = "source" + str(n) + ".html"
savefile = open('OUTPUT.csv', 'w')
while os.path.isfile(filename):
strjpgs = "Extracted Layers: \n \n"
filename = "source" + str(n) + ".html"
n = n + 1
file = open(filename, "r")
soup = BeautifulSoup(file, "html.parser")
thedata = soup.find("div", class_="cplayer")
strdata = str(thedata)
DoRegEx = re.compile('/([^/]+)\.jpg')
jpgs = DoRegEx.findall(strdata)
strjpgs = strjpgs + "\n".join(jpgs) + "\n \n"
savefile.write(filename + '\n')
savefile.write(strjpgs)
print(filename)
print(strjpgs)
savefile.close()
print "done"

use a try / except and break
while os.path.isfile(filename):
try: # try to do this
# <your code>
except FileNotFoundError: # if this error occurs
break # exit the loop
The reason your code doesn't currently work is you're checking the previous file exists in your while loop. Not the next one. Hence you could also do
while True:
strjpgs = "Extracted Layers: \n \n"
filename = "source" + str(n) + ".html"
if not os.path.isfile(filename):
break
# <rest of your code>

you can try opening file, and break out of while loop once you catch an IOError exception.
from bs4 import BeautifulSoup
import re
import os.path
n = 1
filename = "source" + str(n) + ".html"
savefile = open('OUTPUT.csv', 'w')
while os.path.isfile(filename):
try:
strjpgs = "Extracted Layers: \n \n"
filename = "source" + str(n) + ".html"
n = n + 1
file = open(filename, "r")
except IOError:
print("file not found! breaking out of loop.")
break
soup = BeautifulSoup(file, "html.parser")
thedata = soup.find("div", class_="cplayer")
strdata = str(thedata)
DoRegEx = re.compile('/([^/]+)\.jpg')
jpgs = DoRegEx.findall(strdata)
strjpgs = strjpgs + "\n".join(jpgs) + "\n \n"
savefile.write(filename + '\n')
savefile.write(strjpgs)
print(filename)
print(strjpgs)
savefile.close()
print "done"

I'll suggest you to use os.path.exists() (which returns True/False) and os.path.isfile() both.
Use with statement to open file. It is Pythonic way to open files.
with statement is best preferred among the professional coders.
These are the contents of my current working directory.
H:\RishikeshAgrawani\Projects\Stk\ReadHtmlFiles>dir
Volume in drive H is New Volume
Volume Serial Number is C867-828E
Directory of H:\RishikeshAgrawani\Projects\Stk\ReadHtmlFiles
11/05/2018 16:12 <DIR> .
11/05/2018 16:12 <DIR> ..
11/05/2018 15:54 106 source1.html
11/05/2018 15:54 106 source2.html
11/05/2018 15:54 106 source3.html
11/05/2018 16:12 0 stopReadingIfNot.md
11/05/2018 16:11 521 stopReadingIfNot.py
5 File(s) 839 bytes
2 Dir(s) 196,260,925,440 bytes free
The below Python code shows how will you read files source1.html, source2.html, source.3.html and stop if there is no more files of the form sourceX.html (where X is 1, 2, 3, 4, ... etc.).
Sample code:
import os
n = 1;
html_file_name = 'source%d.html'
# It is necessary to check if sourceX.html is file or directory.
# If it is directory the check it if it exists or not.
# It it exists then perform operation (read/write etc.) on file.
while os.path.isfile(html_file_name % (n)) and os.path.exists(html_file_name % (n)):
print "Reading ", html_file_name % (n)
# The best way (Pythonic way) to open file
# You don't need to bother about closing the file
# It will be taken care by with statement
with open(html_file_name % (n), "r") as file:
# Make sure it works
print html_file_name % (n), " exists\n";
n += 1;
Output:
H:\RishikeshAgrawani\Projects\Stk\ReadHtmlFiles>python stopReadingIfNot.py
Reading source1.html
source1.html exists
Reading source2.html
source2.html exists
Reading source3.html
source3.html exists
So based on the above logic. you can modify your code. It will work.
Thanks.

This appears to be a sequence error. Let's look at a small fragment of your code, specifically lines dealing with filename:
filename = "source" + str(n) + ".html"
while os.path.isfile(filename):
filename = "source" + str(n) + ".html"
n = n + 1
file = open(filename, "r")
You're generating the next filename before you open the file (or really, checking the old filename then opening a new one). It's a little hard to see because you're really updating n while filename holds the previous number, but if we look at them in sequence it pops out:
n = 1
filename = "source1.html" # before loop
while os.path.isfile(filename):
filename = "source1.html" # first time inside loop
n = 2
open(filename)
while os.path.isfile(filename): # second time in loop - still source1
filename = "source2.html"
n = 3
open(filename) # We haven't checked if this file exists!
We can fix this a few ways. One is to move the entire updating, n before filename, to the end of the loop. Another is to let the loop mechanism update n, which is a sight easier (the real fix here is that we only use one filename value in each iteration of the loop):
for n in itertools.count(1):
filename = "source{}.html".format(n)
if not os.path.isfile(filename):
break
file = open(filename, "r")
#...
At the risk of looking rather obscure, we can also express the steps functionally (I'm using six here to avoid a difference between Python 2 and 3; Python 2's map wouldn't finish):
from six.moves import map
from itertools import count, takewhile
numbers = count(1)
filenames = map('source{}.html'.format, numbers)
existingfiles = takewhile(os.path.isfile, filenames)
for filename in existingfiles:
file = open(filename, "r")
#...
Other options include iterating over the numbers alone and using break when isfile returns False, or simply catching the exception when open fails (eliminating the need for isfile entirely).

Python -- How to split headers/chapters into separate files automatically

I'm converting text directly to epub and I'm having a problem automatically splitting the HTML book file into separate header/chapter files. At the moment, the code below partially works but only creates every other chapter file. So half the header/chapter files are missing from the output. Here is the code:
def splitHeaderstoFiles(fpath):
infp = open(fpath, 'rt', encoding=('utf-8'))
for line in infp:
# format and split headers to files
if '<h1' in line:
#-----------format header file names and other stuff ------------#
# create a new file for the header/chapter section
path = os.getcwd() + os.sep + header
with open(path, 'wt', encoding=('utf-8')) as outfp:
# write html top meta headers
outfp = addMetaHeaders(outfp)
# add the header
outfp = outfp.write(line)
# add the chapter/header bodytext
for line in infp:
if '<h1' not in line:
outfp.write(line)
else:
outfp.write('</body>\n</html>')
break
else:
continue
infp.close()
The problem occurs in the second 'for loop' at the bottom of the code, when I look for the next h1 tag to stop the split. I cannot use seek() or tell() to rewind or move back one line so the program can find the next header/chapter on the next iteration. Apparently you cannot use these in python in a for loop containing an implicit iter or next object in operation. Just gives a 'can't do non-zero cur-relative seeks' error.
I've also tried the while line != ' ' + readline() combination in the code which also gives the same error as above.
Does anyone know an easy way to split HTML headers/chapters of varying lengths into separate files in python? Are there any special python modules(such as pickles) that could help make this task easier?
I'm using Python 3.4
My grateful thanks in advance for any solutions to this problem...

I ran into similar problem a while ago, here is a simplified solution:
from itertools import count
chapter_number = count(1)
output_file = open('000-intro.html', 'wb')
with open('index.html', 'rt') as input_file:
for line in input_file:
if '<h1' in line:
output_file.close()
output_file = open('{:03}-chapter'.format(next(chapter_number)), 'wb')
output_file.write(line)
output_file.close()
In this approach, the first block of text leading to the first h1 block is written into 000-intro.html, the first chapter will be written into 001-chapter.html and so on. Please modify it to taste.
The solution is a simple one: Upon encountering the h1 tag, close the last output file and open a new one.

You are looping over your input file twice, which is likely causing your problems:
for line in infp:
...
with open(path, 'wt', encoding=('utf-8')) as outfp:
...
for line in infp:
...
Each for is going to have it's own iterator, so you are going to loop over the file many times.
You might try transforming your for loop into a while so you're not using two different iterators:
while infp:
line = infp.readline()
if '<h1' in line:
with open(...) as outfp:
while infp:
line = infp.readline()
if '<h1' in line:
break
outfp.writeline(...)
Alternatively, you may wish to use an HTML parser (i.e., BeautifulSoup). Then you can do something like what is described here: https://stackoverflow.com/a/8735688/65295.
Update from comment - essentially, read the entire file all at once so you can freely move back or forward as necessary. This probably won't be a performance issue unless you have a really really big file (or very little memory).
lines = infp.readlines() # read the entire file
i = 0
while i < len(lines):
if '<h1' in lines[i]:
with open(...) as outfp:
j = i + 1
while j < len(lines):
if '<h1' in lines[j]:
break
outfp.writeline(lines[j])
# line j has an <h1>, set i to j so we detect the it at the
# top of the next loop iteration.
i = j
else:
i += 1

I eventually found the answer to the above problem. The code below does alot more that just get the file header. It also simultaneously loads two parallel list arrays with formatted file name data(with extension) and pure header name data respectively so I can use these lists to fill in the and formatted filename extension in these html files within a while loop in one hit. The code now works well and is shown below.
def splitHeaderstoFiles(dir, inpath):
count = 1
t_count = 0
out_path = ''
header = ''
write_bodytext = False
file_path_names = []
pure_header_names = []
inpath = dir + os.sep + inpath
with open(inpath, 'rt', encoding=('utf-8')) as infp:
for line in infp:
if '<h1' in line:
#strip html tags, convert to start caps
p = re.compile(r'<.*?>')
header = p.sub('', line)
header = capwords(header)
line_save = header
# Add 0 for count below 10
if count < 10:
header = '0' + str(count) + '_' + header
else:
header = str(count) + '_' + header
# remove all spaces + add extension in header
header = header.replace(' ', '_')
header = header + '.xhtml'
count = count + 1
#create two parallel lists used later
out_path = dir + os.sep + header
outfp = open(out_path, 'wt', encoding=('utf-8'))
file_path_names.insert(t_count, out_path)
pure_header_names.insert(t_count, line_save)
t_count = t_count + 1
# Add html meta headers and write it
outfp = addMainHeaders(outfp)
outfp.write(line)
write_bodytext = True
# add header bodytext
elif write_bodytext == True:
outfp.write(line)
# now add html titles and close the html tails on all files
max_num_files = len(file_path_names)
tmp = dir + os.sep + 'temp1.tmp'
i = 0
while i < max_num_files:
outfp = open(tmp, 'wt', encoding=('utf-8'))
infp = open(file_path_names[i], 'rt', encoding=('utf-8'))
for line in infp:
if '<title>' in line:
line = line.strip(' ')
line = line.replace('<title></title>', '<title>' + pure_header_names[i] + '</title>')
outfp.write(line)
else:
outfp.write(line)
# add the html tail
if '</body>' in line or '</html>' in line:
pass
else:
outfp.write(' </body>' + '\n</html>')
# clean up
infp.close()
outfp.close()
shutil.copy2(tmp, file_path_names[i])
os.remove(tmp)
i = i + 1
# now rename just the title page
if os.path.isfile(file_path_names[0]):
title_page_name = file_path_names[0]
new_title_page_name = dir + os.sep + '01_Title.xhtml'
os.rename(title_page_name, new_title_page_name)
file_path_names[0] = '01_Title.xhtml'
else:
logmsg27(DEBUG_FLAG)
os._exit(0)
# xhtml file is no longer needed
if os.path.isfile(inpath):
os.remove(inpath)
# returned list values are also used
# later to create epub opf and ncx files
return(file_path_names, pure_header_names)
#Hai Vu and #Seth -- Thanks for all your help.

Python Writing to txt error

Im trying to write different things onto a text file in a while loop but it only writes it once. I want to write something to unmigrated.txt
import urllib.request
import json
Txtfile = input("Name of the TXT file: ")
fw = open(Txtfile + ".txt", "r")
red = fw.read()
blue = red.split("\n")
i=0
while i<len(blue):
try:
url = "https://api.mojang.com/users/profiles/minecraft/" + blue[i]
rawdata = urllib.request.urlopen(url)
newrawdata = rawdata.read()
jsondata = json.loads(newrawdata.decode('utf-8'))
results = jsondata['id']
url_uuid = "https://sessionserver.mojang.com/session/minecraft/profile/" + results
rawdata_uuid = urllib.request.urlopen(url_uuid)
newrawdata_uuid = rawdata_uuid.read()
jsondata_uuid = json.loads(newrawdata_uuid.decode('utf-8'))
try:
results = jsondata_uuid['legacy']
print (blue[i] + " is " + "Unmigrated")
wf = open("unmigrated.txt", "w")
wring = wf.write(blue[i] + " is " + "Unmigrated\n")
except:
print(blue[i] + " is " + "Migrated")
except:
print(blue[i] + " is " + "Not-Premium")
i+=1

You keep overwriting opening the file with w inside the loop so you only see the last data that was written to the file, either open the file once outside the loop or open with a to append. Opening once would be the simplest approach, you can also use range instead of your while or better again just iterate over the list:
with open("unmigrated.txt", "w") as f: # with close your file automatically
for ele in blue:
.....
Also wring = wf.write(blue[i] + " is " + "Unmigrated\n") sets wring to None which is what write returns so probably not of any real use.
Lastly using a blank expect is usually never a good idea, catch the specific exceptions you expect and log or at least print when you get an error.
Using the requests library, I would break up your code doing something like:
import requests
def get_json(url):
try:
rawdata = requests.get(url)
return rawdata.json()
except requests.exceptions.RequestException as e:
print(e)
except ValueError as e:
print(e)
return {}
txt_file = input("Name of the TXT file: ")
with open(txt_file + ".txt") as fw, open("unmigrated.txt", "w") as f: # with close your file automatically
for line in map(str.rstrip, fw): # remove newlines
url = "https://api.mojang.com/users/profiles/minecraft/{}".format(line)
results = get_json(url).get("id")
if not results:
continue
url_uuid = "https://sessionserver.mojang.com/session/minecraft/profile/{}".format(results)
results = get_json(url_uuid).get('legacy')
print("{} is Unmigrated".format(line))
f.write("{} is Unmigrated\n".format(line))
I am not sure where 'legacy' fits into the code, that logic I will leave to you. You can also iterate directly over the file object so you can forget about splitting the lines into blue.

try:
with open("filename", "w") as f:
f.write("your content")
But that will overwrite all contents of the file.
Instead, if you want to append to the file use:
with open("filename", "a") as f:
If you choose to not use the with syntax, remember to close the file.
Read more here:
https://docs.python.org/2/library/functions.html#open

Script that reads PDF metadata and writes to CSV

I wrote a script to read PDF metadata to ease a task at work. The current working version is not very usable in the long run:
from pyPdf import PdfFileReader
BASEDIR = ''
PDFFiles = []
def extractor():
output = open('windoutput.txt', 'r+')
for file in PDFFiles:
try:
pdf_toread = PdfFileReader(open(BASEDIR + file, 'r'))
pdf_info = pdf_toread.getDocumentInfo()
#print str(pdf_info) #print full metadata if you want
x = file + "~" + pdf_info['/Title'] + " ~ " + pdf_info['/Subject']
print x
output.write(x + '\n')
except:
x = file + '~' + ' ERROR: Data missing or corrupt'
print x
output.write(x + '\n')
pass
output.close()
if __name__ == "__main__":
extractor()
Currently, as you can see, I have to manually input the working directory and manually populate the list of PDF files. It also just prints out the data in the terminal in a format that I can copy/paste/separate into a spreadsheet.
I'd like the script to work automatically in whichever directory I throw it in and populate a CSV file for easier use. So far:
from pyPdf import PdfFileReader
import csv
import os
def extractor():
basedir = os.getcwd()
extension = '.pdf'
pdffiles = [filter(lambda x: x.endswith('.pdf'), os.listdir(basedir))]
with open('pdfmetadata.csv', 'wb') as csvfile:
for f in pdffiles:
try:
pdf_to_read = PdfFileReader(open(f, 'r'))
pdf_info = pdf_to_read.getDocumentInfo()
title = pdf_info['/Title']
subject = pdf_info['/Subject']
csvfile.writerow([file, title, subject])
print 'Metadata for %s written successfully.' % (f)
except:
print 'ERROR reading file %s.' % (f)
#output.writerow(x + '\n')
pass
if __name__ == "__main__":
extractor()
In its current state it seems to just prints a single error (as in, the error message in the exception, not an error returned by Python) message and then stop. I've been staring at it for a while and I'm not really sure where to go from here. Can anyone point me in the right direction?

writerow([file, title, subject]) should be writerow([f, title, subject])
You can use sys.exc_info() to print the details of your error
http://docs.python.org/2/library/sys.html#sys.exc_info

Did you check the pdffiles variable contains what you think it does? I was getting a list inside a list... so maybe try:
for files in pdffiles:
for f in files:
#do stuff with f
I personally like glob. Notice I add * before the .pdf in the extension variable:
import os
import glob
basedir = os.getcwd()
extension = '*.pdf'
pdffiles = glob.glob(os.path.join(basedir,extension)))

Figured it out. The script I used to download the files was saving the files with '\r\n' trailing after the file name, which I didn't notice until I actually ls'd the directory to see what was up. Thanks for everyone's help.

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Splitting a big HTML file into multiple smaller files - python

Related

I am trying to create a program that edits text by letting you select from a few things and changes them to one of a few options

Python - How to stop the loop

Python -- How to split headers/chapters into separate files automatically

Python Writing to txt error

Script that reads PDF metadata and writes to CSV

Categories

Resources