BeautifulSoup html parser taking time to parse html file - python

I'm trying to get the results from an HTML file using BeautifulSoup:
with open(r'/home/maria/Desktop/iqyylog.html', "r") as f:
    page = f.read()
soup = BeautifulSoup(page, 'html.parser')
for tag in soup.find_all('details'):
    print tag
The problem is that the iqyylog.html file contains more than 2,500 nodes, so parsing takes a long time to load the data. Is there any other way to parse an HTML file with this much data? When I use the lxml parser, it picks up only the first 25 nodes.

Try this.
from simplified_scrapy import SimplifiedDoc, utils
html = utils.getFileContent(r'test.html')
doc = SimplifiedDoc(html)
details = doc.selects('details')
for detail in details:
    print(detail.tag)
If you still have problems, try the following.
import io
from simplified_scrapy import SimplifiedDoc, utils

def getDetails(fileName):
    details = []
    tag = 'details'
    with io.open(fileName, "r", encoding='utf-8') as file:
        # Suppose the start and end tags are not on the same line, as shown below
        # <details>
        #     some words
        # </details>
        line = file.readline()  # Read data line by line
        stanza = None  # Store a details node
        while line != '':
            if line.strip() == '':
                line = file.readline()
                continue
            if stanza and line.find('</' + tag + '>') >= 0:
                doc = SimplifiedDoc(stanza + '</' + tag + '>')  # Instantiate a doc
                details.append(doc.select(tag))
                stanza = None
            elif stanza:
                stanza = stanza + line
            else:
                if line.find('<' + tag) >= 0:
                    stanza = line
            line = file.readline()
    return details

details = getDetails('test.html')
for detail in details:
    print(detail.tag)
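If you would rather keep BeautifulSoup, one option worth trying (my suggestion, not part of the answer above) is bs4's SoupStrainer, which tells html.parser to build the tree only for the tags you ask for, so a large file with many unrelated nodes parses faster and uses less memory. A minimal sketch, reusing the file path from the question:

from bs4 import BeautifulSoup, SoupStrainer

with open(r'/home/maria/Desktop/iqyylog.html', "r") as f:
    page = f.read()

only_details = SoupStrainer('details')  # build only the <details> nodes
soup = BeautifulSoup(page, 'html.parser', parse_only=only_details)
for tag in soup.find_all('details'):
    print(tag)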


Python - why the print result is repeated and "write to a text file" only has one line

Lovely people! I'm totally new to Python. I tried to scrape several URLs and encountered a problem with "print".
I tried to print and write the "shipment status".
I have two URLs, so ideally I get two results.
This is my code:
from bs4 import BeautifulSoup
import re
import urllib.request
import urllib.error
import urllib

# read urls of websites from text file
list_open = open("c:/Users/***/Downloads/web list.txt")
read_list = list_open.read()
line_in_list = read_list.split("\n")

for url in line_in_list:
    soup = BeautifulSoup(urllib.request.urlopen(url).read(), 'html')
    # parse something special in the file
    shipment = soup.find_all('span')
    Preparation=shipment[0]
    Sent=shipment[1]
    InTransit=shipment[2]
    Delivered=shipment[3]
    for p in shipment:
        # extract information
        print (url,';',"Preparation",Preparation.getText(),";","Sent",Sent.getText(),";","InTransit",InTransit.getText(),";","Delivered",Delivered.getText())

import sys
file_path = 'randomfile.txt'
sys.stdout = open(file_path, "w")
print(url,';',"Preparation",Preparation.getText(),";","Sent",Sent.getText(),";","InTransit",InTransit.getText(),";","Delivered",Delivered.getText())
I have two problems here:
Problem one: I have only two URLs, and when I print the results, every "span" is repeated 4 times (as there are four "span"s).
The result in the "output" is as below:
(I deleted the result example to protect privacy.)
Problem two: I tried to write the "print" to a text file, but only one line appeared in the file:
(I deleted the result example to protect privacy.)
I want to know what is wrong with the code. I want to print only the 2 URL results.
Your help is really appreciated!
Thank you in advance!
The first point is caused by iterating over shipment - just delete the inner for loop and correct the indentation of print():
for url in line_in_list:
    soup = BeautifulSoup(urllib.request.urlopen(url).read(), 'html')
    # parse something special in the file
    shipment = soup.find_all('span')
    Preparation=shipment[0]
    Sent=shipment[1]
    InTransit=shipment[2]
    Delivered=shipment[3]
    print (url,';',"Preparation",Preparation.getText(),";","Sent",Sent.getText(),";","InTransit",InTransit.getText(),";","Delivered",Delivered.getText())
The second issue is caused by writing to the file outside the loop and not opening it in append mode - you will end up with this as your loop:
# open file in append mode
with open('somefile.txt', 'a') as f:
    # start iterating your urls
    for url in line_in_list:
        soup = BeautifulSoup(urllib.request.urlopen(url).read(), 'html')
        # parse something special in the file
        shipment = soup.find_all('span')
        Preparation=shipment[0]
        Sent=shipment[1]
        InTransit=shipment[2]
        Delivered=shipment[3]
        # create output text
        line = f'{url};Preparation{Preparation.getText()};Sent{Sent.getText()};InTransit{InTransit.getText()};Delivered{Delivered.getText()}'
        # print output text
        print(line)
        # append output text to file
        f.write(line+'\n')
And you can delete:
import sys
file_path = 'randomfile.txt'
sys.stdout = open(file_path, "w")
print(url,';',"Preparation",Preparation.getText(),";","Sent",Sent.getText(),";","InTransit",InTransit.getText(),";","Delivered",Delivered.getText())`
Example of slightly optimized code:
from bs4 import BeautifulSoup
import urllib.request
import urllib.error
import urllib

# read urls of websites from text file
list_open = open("c:/Users/***/Downloads/web list.txt")
read_list = list_open.read()
line_in_list = read_list.split("\n")

file_path = "randomfile.txt"

with open('somefile.txt', 'a', encoding='utf-8') as f:
    for url in line_in_list:
        soup = BeautifulSoup(urllib.request.urlopen(url).read(), 'html')
        # parse something special in the file
        shipment = list(soup.select_one('#progress').stripped_strings)
        line = f"{url},{';'.join([':'.join(x) for x in list(zip(shipment[::2], shipment[1::2]))])}"
        print(line)
        f.write(line+'\n')
list_open = open("c:/Users/***/Downloads/web list.txt")
read_list = list_open.read()
line_in_list = read_list.split("\n")
file_path = 'randomfile.txt'
sys.stdout = open(file_path, "w")
There are four spans actually, try this:
for url in line_in_list:
    soup = BeautifulSoup(urlopen(url).read(), 'html')
    # parse something special in the file
    shipments = soup.find_all("span") # there are four span actually;
    sys.stdout.write('Url '+url+'; Preparation'+shipments[0].getText()+'; Sent'+shipments[1].getText()+'; InTransit'+shipments[2].getText()+'; Delivered'+shipments[3].getText())
    # change line
    sys.stdout.write("\r")
First question
You have two nested loops:
for url in line_in_list:
    for p in shipment:
        print(...)
The print is nested in the second loop. If you have 4 shipments per url, that will lead to 4 prints per url.
Since you don't use p from for p in shipment, you can completely get rid of the second loop and move the print one indentation level to the left, like this:
for url in line_in_list:
    soup = BeautifulSoup(urllib.request.urlopen(url).read(), 'html')
    # parse something special in the file
    shipment = soup.find_all('span')
    Preparation=shipment[0]
    Sent=shipment[1]
    InTransit=shipment[2]
    Delivered=shipment[3]
    print (url,';',"Preparation",Preparation.getText(),";","Sent",Sent.getText(),";","InTransit",InTransit.getText(),";","Delivered",Delivered.getText())
Second question
sys.stdout = open(file_path, "w")
print(url,';',"Preparation",Preparation.getText(),";","Sent",Sent.getText(),";","InTransit",InTransit.getText(),";","Delivered",Delivered.getText())`
Without the file keyword argument, print writes to sys.stdout, which by default is your terminal output. There's only one print after sys.stdout = ..., so there will only be one line written to the file.
There's another way to print to a file:
with open('demo.txt', 'a') as f:
    print('Hello world', file=f)
The keyword with will ensure the file is closed even if an exception is raised.
Both combined
From what I understood, you want to print two lines to the file. Here's a solution:
from bs4 import BeautifulSoup
import urllib.request
import urllib.error
import urllib

# read urls of websites from text file
list_open = open("c:/Users/***/Downloads/web list.txt")
read_list = list_open.read()
line_in_list = read_list.split("\n")

file_path = "randomfile.txt"

for url in line_in_list:
    soup = BeautifulSoup(urllib.request.urlopen(url).read(), "html")
    # parse something special in the file
    shipment = soup.find_all("span")
    Preparation = shipment[0]
    Sent = shipment[1]
    InTransit = shipment[2]
    Delivered = shipment[3]
    with open(file_path, "a") as f:
        # a trailing newline so each url ends up on its own line
        f.write(
            f"{url} ; Preparation {Preparation.getText()}; Sent {Sent.getText()}; InTransit {InTransit.getText()}; Delivered {Delivered.getText()}\n"
        )

Creating multiple text files with unique file names from scraped data

I took an introductory course in Python this semester and am now trying to do a project. However, I don't really know what code I should write to create multiple .txt files, each with a different title.
I scraped all the terms and definitions from the website http://www.hogwartsishere.com/library/book/99/. The title of the .txt file should, for example, be 'Aconite.txt', and the content of the file should be the title and the definition. Every term with its definition can be found in a separate p-tag, and the term itself is a b-tag within the p-tag. Can I use this to write my code?
I suppose I will need to use a for-loop for this, but I don't really know where to start. I searched StackOverflow and found several solutions, but all of them contain code I am not familiar with and/or relate to another issue.
This is what I have so far:
#!/usr/bin/env python
import requests
import bs4

def download(url):
    r = requests.get(url)
    html = r.text
    soup = bs4.BeautifulSoup(html, 'html.parser')
    terms_definition = []
    #for item in soup.find_all('p'): # define this better
    items = soup.find_all("div", {"class" : "font-size-16 roboto"})
    for item in items:
        terms = item.find_all("p")
        for term in terms:
            #print(term)
            if term.text is not 'None':
                #print(term.text)
                #print("\n")
                term_split = term.text.split()
                print(term_split)
            if term.text != None and len(term.text) > 1:
                if '-' in term.text.split():
                    print(term.text)
                    print('\n')
        if item.find('p'):
            terms_definition.append(item['p'])
    print(terms_definition)
    return terms_definition

def create_url(start, end):
    list_url = []
    base_url = 'http://www.hogwartsishere.com/library/book/99/chapter/'
    for x in range(start, end):
        list_url.append(base_url + str(x))
    return list_url

def search_all_url(list_url):
    for url in list_url:
        download(url)

#write data into separate text files. Word in front of the dash should be title of the document, term and definition should be content of the text file
#all terms and definitions are in separate p-tags, title is a b-tag within the p-tag
def name_term
def text_files
    path_write = os.path.join('data', name_term +'.txt') #'term' should be replaced by the scraped terms
    with open(path_write, 'w') as f:
        f.write()
#for loop? in front of dash = title / everything before and after dash = text (file content) / empty line = new file

if __name__ == '__main__':
    download('http://www.hogwartsishere.com/library/book/99/chapter/1')
    #list_url = create_url(1, 27)
    #search_all_url(list_url)
Thanks in advance!
You can iterate over all pages (1-27) to get their content, then parse each page with bs4 and save the results to files:
import requests
import bs4
import re

for i in range(1, 27):
    r = requests.get('http://www.hogwartsishere.com/library/book/99/chapter/{}/'.format(i)).text
    soup = bs4.BeautifulSoup(r, 'html.parser')
    items = soup.find_all("div", {"class": "font-size-16 roboto"})
    for item in items:
        terms = item.find_all("p")
        for term in terms:
            title = re.match('^(.*) -', term.text).group(1).replace('/', '-')
            with open(title + '.txt', 'w', encoding='utf-8') as f:
                f.write(term.text)
Output files:
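The question mentions that each term is a b-tag within the p-tag, so a variation that takes the file name from that <b> tag instead of splitting on the " - " separator could also work. A sketch based only on that description, not tested against the live site:

import requests
import bs4

for i in range(1, 27):
    r = requests.get('http://www.hogwartsishere.com/library/book/99/chapter/{}/'.format(i)).text
    soup = bs4.BeautifulSoup(r, 'html.parser')
    for item in soup.find_all("div", {"class": "font-size-16 roboto"}):
        for term in item.find_all("p"):
            b = term.find("b")  # the term itself sits in a <b> inside the <p>
            if b is None or not b.text.strip():
                continue  # skip paragraphs without a term
            title = b.text.strip().replace('/', '-')
            with open(title + '.txt', 'w', encoding='utf-8') as f:
                f.write(term.text)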

Scrape a MediaWiki website (specific html tags) using Python

I would like to scrape this specific MediaWiki website with specific tags. Here is my current code.
import urllib.request
from bs4 import BeautifulSoup
url = "https://wiki.sa-mp.com/wiki/Strfind"
html = urllib.request.urlopen(url).read()
soup = BeautifulSoup(html, "html.parser")
# kill all script and style elements
for script in soup(["script", "style"]):
    script.extract() # rip it out
# get text
text = soup.get_text()
# break into lines and remove leading and trailing space on each
lines = (line.strip() for line in text.splitlines())
# break multi-headlines into a line each
chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
# drop blank lines
text = '\n'.join(chunk for chunk in chunks if chunk)
print(text)
If you look at the page, it has a description, parameters, return values and an example usage. That's what I would like to scrape. Thank you!
There may be a more efficient way to do this, but the following uses CSS selectors to grab that information:
from bs4 import BeautifulSoup
import requests as re
url ="https://wiki.sa-mp.com/wiki/Strfind"
response = re.get(url)
soup = BeautifulSoup(response.content, "lxml")
description = soup.select_one('.description').text
initial_parameters = soup.select('.parameters,.param')
final_parameters = [parameter.text for parameter in initial_parameters]
returnValues = soup.select_one('#bodyContent > .param + p + div').text
exampleUsage = soup.select_one('.pawn').text
results = [description,final_parameters,returnValues,exampleUsage]
print(results)

python: extracting text from any website

So far I have done my work, and it successfully gets text from these two websites:
http://www.tutorialspoint.com/cplusplus/index.htm
http://www.cplusplus.com/doc/tutorial/program_structure/
But I don't know where I am going wrong; it is not getting text from other websites and gives me an error when I use other links, such as:
http://www.cmpe.boun.edu.tr/~akin/cmpe223/chap2.htm
http://www.i-programmer.info/babbages-bag/477-trees.html
http://www.w3schools.com/html/html_elements.asp
Error:
Traceback (most recent call last):
File "C:\Users\DELL\Desktop\python\s\fyp\data extraction.py", line 20, in
text = soup.select('.C_doc')[0].get_text()
IndexError: list index out of range
My code:
import urllib
from bs4 import BeautifulSoup

url = "http://www.i-programmer.info/babbages-bag/477-trees.html" #unsuccessful
#url = "http://www.tutorialspoint.com/cplusplus/index.htm" #doing successfully
#url = "http://www.cplusplus.com/doc/tutorial/program_structure/" #doing successfully
html = urllib.urlopen(url).read()
soup = BeautifulSoup(html)

# kill all script and style elements
for script in soup(["script", "style","a","<div id=\"bottom\" >"]):
    script.extract() # rip it out

# get text
#text = soup.select('.C_doc')[0].get_text()
#text = soup.select('.content')[0].get_text()
if soup.select('.content'):
    text = soup.select('.content')[0].get_text()
else:
    text = soup.select('.C_doc')[0].get_text()

# break into lines and remove leading and trailing space on each
lines = (line.strip() for line in text.splitlines())
# break multi-headlines into a line each
chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
# drop blank lines
text = '\n'.join(chunk for chunk in chunks if chunk)
print text

fo = open('foo.txt', 'w')
fo.seek(0, 2)
line = fo.writelines( text )
fo.close()
#writing done :)
Try using
Text = soup.findAll(text=True)
UPDATE
This is a basic text stripper you can start from.
import urllib
from bs4 import BeautifulSoup
url = "http://www.i-programmer.info/babbages-bag/477-trees.html"
html = urllib.urlopen(url).read()
soup = BeautifulSoup(html)
for script in soup(["script", "style","a","<div id=\"bottom\" >"]):
script.extract()
text = soup.findAll(text=True)
for p in text:
print p
You are assuming that every website you scrape has the class name content OR C_doc.
What if the website you scrape has neither class name?
Here is the fix:
text = ''
if soup.select('.content'):
    text = soup.select('.content')[0].get_text()
elif soup.select('.C_doc'):
    text = soup.select('.C_doc')[0].get_text()

if text:
    pass  # put the rest of the code here
else:
    print 'text does not exist.'
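If you want the script to keep working on pages that have neither class, a simple fallback (my sketch, not part of the answer above) is to drop back to the text of the whole page:

def extract_text(soup):
    # try the known content containers first, then fall back to all visible text
    for selector in ('.content', '.C_doc'):
        found = soup.select(selector)
        if found:
            return found[0].get_text()
    return soup.get_text()  # last resort: everything on the page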

Python HTML parsing script that takes array of URLs and outputs specific data about each of the URLs

I am trying to write an HTML parser in Python that takes as its input a URL or list of URLs and outputs specific data about each of those URLs in the format:
URL: data1: data2
The data points can be found at the exact same HTML node in each of the URLs. They are consistently between the same starting and ending tags. If anyone out there would like to help an amateur Python programmer get the job done, it would be greatly appreciated. Extra points if you can come up with a way to output the information so that it can be easily copied and pasted into an Excel document for subsequent data analysis!
For example, let's say I would like to output the view count for a particular YouTube video. For the URL http://www.youtube.com/watch?v=QOdW1OuZ1U0, the view count is around 3.6 million. For all YouTube videos, this number is found in the following format within the page's source:
<span class="watch-view-count ">
3,595,057
</span>
Fortunately, these exact tags are found only once on a particular YouTube video's page. These starting and ending tags can be inputted into the program or built-in and modified when necessary. The output of the program would be:
http://www.youtube.com/watch?v=QOdW1OuZ1U0: 3,595,057 (or 3595057).
import urllib2
from bs4 import BeautifulSoup
url = 'http://www.youtube.com/watch?v=QOdW1OuZ1U0'
f = urllib2.urlopen(url)
data = f.read()
soup = BeautifulSoup(data)
span = soup.find('span', attrs={'class':'watch-view-count'})
print '{}:{}'.format(url, span.text)
If you do not want to use BeautifulSoup, you can use re:
import urllib2
import re
url = 'http://www.youtube.com/watch?v=QOdW1OuZ1U0'
f = urllib2.urlopen(url)
data = f.read()
pattern = re.compile('<span class="watch-view-count.*?([\d,]+).*?</span>', re.DOTALL)
r = pattern.search(data)
print '{}:{}'.format(url, r.group(1))
As for the outputs, I think you can store them in a csv file.
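For the CSV part, here is a minimal sketch built on the re snippet above (Python 2 style to match it; on Python 3 you would open the file with newline='' instead of 'wb'). The output file name view_counts.csv is just an example:

import csv
import re
import urllib2

urls = ['http://www.youtube.com/watch?v=QOdW1OuZ1U0']
pattern = re.compile('<span class="watch-view-count.*?([\d,]+).*?</span>', re.DOTALL)

with open('view_counts.csv', 'wb') as f:  # 'wb' is what Python 2's csv module expects
    writer = csv.writer(f)
    writer.writerow(['url', 'view_count'])
    for url in urls:
        data = urllib2.urlopen(url).read()
        match = pattern.search(data)
        writer.writerow([url, match.group(1) if match else ''])

Each row pairs the URL with the scraped count, and the file opens directly in Excel.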
I prefer HTMLParser over re for this type of task. However, HTMLParser can be a bit tricky. I use module-level lists to store data... I'm sure this is the wrong way of doing it, but it's worked on several projects for me in the past.
import urllib2
from HTMLParser import HTMLParser
import csv

position = []
results = [""]

class hp(HTMLParser):
    def handle_starttag(self, tag, attrs):
        if tag == 'span' and ('class', 'watch-view-count ') in attrs:
            position.append('bingo')
    def handle_endtag(self, tag):
        if tag == 'span' and 'bingo' in position:
            position.remove('bingo')
    def handle_data(self, data):
        if 'bingo' in position:
            results[0] += " " + data.strip() + " "

my_pages = ["http://www.youtube.com/watch?v=QOdW1OuZ1U0"]
data = []
for url in my_pages:
    response = urllib2.urlopen(url)
    page = str(response.read())
    parser = hp()
    parser.feed(page)
    data.append(results[0])
    # reinitialize the module-level lists for the next url
    position = []
    results = [""]

index = 0
with open('/path/to/test.csv', 'wb') as f:
    writer = csv.writer(f)
    header = ['url', 'output']
    writer.writerow(header)
    for d in data:
        row = [my_pages[index], data[index]]
        writer.writerow(row)
        index += 1
Then just open /path/to/test.csv in Excel
