I am using BeautifulSoup to parse a webpage of poetry. The poetry is separated into h3 for poem title, and .line for each line of the poem. I can get both elements and add them to a list. But I want to manipulate the h3 to be uppercase and indicate a line break, then insert it into the lines list.
# Collect the text of every section into one flat list.
linesArr = []
for section in full_text:
    # First pass: every <h3> title of the section, upper-cased and followed
    # by an empty string (the line-break marker).
    for heading in section.select('h3'):
        linesArr.extend([heading.text.upper(), ''])
    # Second pass: titles and poem lines together, in document order.
    linesArr.extend(node.text for node in section.select('h3, .line'))
This code appends all book titles to the beginning of the list, then continues getting the h3 and .line items. I have tried inserting code like this:
# Walk titles and poem lines once, in document order, so that every title
# stays directly above the lines that belong to it.
linesArr = []
for lines in full_text:
    for line in lines.select('h3, .line'):
        # Tag.find() only searches *inside* a tag, so line.find('h3') does
        # not match the <h3> element itself; compare the tag name instead.
        if line.name == 'h3':
            linesArr.append(line.text.upper())
            # Empty entry marks the line break after a title.
            linesArr.append('')
        else:
            linesArr.append(line.text)
I'm not sure what you are trying to do, but this way you get a list with the title in upper case followed by all of the lines:
#!/usr/bin/python3
# coding: utf8
from bs4 import BeautifulSoup
import requests
# Download the page and parse it.
page = requests.get("https://quod.lib.umich.edu/c/cme/CT/1:1?rgn=div2;view=fulltext")
# Stop here on an HTTP error instead of parsing an error page.
page.raise_for_status()
soup = BeautifulSoup(page.text, 'html.parser')

# find() returns only the first <h3>, or None when the page has none.
title = soup.find('h3')
full_lines = soup.find_all('div', {'class': 'line'})

# Title first (upper-cased), then every poem line in document order.
linesArr = []
if title is not None:
    linesArr.append(title.get_text().upper())
linesArr.extend(line.get_text() for line in full_lines)

# Print full array with the title and text
print(linesArr)

# Print text here with line break
for linea in linesArr:
    print(linea + '\n')
Related
Complete newbie but I've managed to successfully scrape EAN numbers with Python from a list of links created by an upstream piece of code. However, my output file contains all the scraped numbers as a continuous single line instead of one EAN per line.
Here's my code - what's wrong with it? (scraped URL redacted)
import requests
from bs4 import BeautifulSoup
import urllib.request
import os
# Visit result pages 1 and 2.
for subpage in range(1, 3):
    URL = "https://..." + str(subpage)
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, "html.parser")

    # writes all links under the h2 tag into a list
    links = [
        "http://www.xxxxxxxxxxx.com" + h2.a['href']
        for h2 in soup.find_all("h2")
    ]

    # opens links from list and extracts EAN number from underlying page
    with open("temp.txt", "a") as output:
        for link in links:
            # The response of this call is not used; the page is fetched
            # again with requests on the next line.
            urllib.request.urlopen(link)
            page_2 = requests.get(link)
            soup_2 = BeautifulSoup(page_2.content, "html.parser")
            if "EAN:" not in soup_2.text:
                continue
            # write() emits exactly the text it is given.
            output.write(soup_2.find(class_="articleData_ean").a.text)

# Rename the finished temp file.
os.replace('temp.txt', 'EANs.txt')
output.write(EAN) is writing each EAN without anything between them. It doesn't automatically add a separator or newline. You can add a newline: output.write('\n') or comma, etc. to separate them
I would like to scrape this specific MediaWiki website with specific tags. Here is my current code.
import urllib.request
from bs4 import BeautifulSoup
url = "https://wiki.sa-mp.com/wiki/Strfind"
html = urllib.request.urlopen(url).read()
soup = BeautifulSoup(html, "html.parser")

# Remove every <script> and <style> element from the tree.
for tag in soup.find_all(["script", "style"]):
    tag.extract()

# Trim each line of the page text, split it on single spaces, trim the
# pieces again and keep only the non-empty ones.
pieces = []
for raw_line in soup.get_text().splitlines():
    for phrase in raw_line.strip().split(" "):
        phrase = phrase.strip()
        if phrase:
            pieces.append(phrase)

# One piece per output line.
text = '\n'.join(pieces)
print(text)
If you look at the URL, there is the description, parameters, return values and the example usage. That's what I would like to scrape. Thank you!
There may be a more efficient way to do this but the following uses css selectors to grab that information
from bs4 import BeautifulSoup
# Imported under its own name: aliasing it to "re" would shadow the
# standard-library regular-expression module.
import requests

url = "https://wiki.sa-mp.com/wiki/Strfind"
response = requests.get(url)
soup = BeautifulSoup(response.content, "lxml")

# One CSS selector per section of the wiki page.
description = soup.select_one('.description').text
final_parameters = [
    parameter.text for parameter in soup.select('.parameters,.param')
]
# The return-value text is selected by position: the <div> directly after
# the <p> that directly follows a .param element inside #bodyContent.
returnValues = soup.select_one('#bodyContent > .param + p + div').text
exampleUsage = soup.select_one('.pawn').text

results = [description, final_parameters, returnValues, exampleUsage]
print(results)
I've scraped the website for my research, but I couldn't find the right way to extract the results into a data frame. I believe that my problem is related to the list objects between lines 36 and 38.
The print line works very well: in the Python console I can see what the final data frame should contain.
The solution can be really easy but I couldn't figure it out. Thanks in advance for all help.
from time import sleep
from bs4 import BeautifulSoup, SoupStrainer
import requests
import pandas as pd
# Insert the highest page number for the website (inclusive)
highest_number = 12


def total_page_number(url, last_page=None):
    """Return the URL of every result page, the first page included.

    :param url: URL of the first result page; it is returned unchanged as
        the first element of the list
    :param last_page: number of the last page to generate (inclusive);
        defaults to the module-level ``highest_number``
    :return: ``[url, url + '&page=2', ..., url + '&page=<last_page>']``
    """
    if last_page is None:
        last_page = highest_number
    # range() excludes its end value, hence the "+ 1": without it the last
    # page would be silently dropped.
    return [url] + [
        url + '&page=' + str(number) for number in range(2, last_page + 1)
    ]


# Use total_page_number function to create page list for website
All_page = total_page_number(
    'https://www.imdb.com/search/title?countries=tr&languages=tr&locations=Turkey&count=250&view=simple')
def clean_text(text: str) -> str:
    """Collapse every run of whitespace in *text* into a single space.

    Leading and trailing whitespace is removed as well.

    :param text: the string to clean
    :return: a "cleaned" string with no more than one white space between
        words
    """
    # split() without arguments splits on any whitespace run and drops the
    # empty pieces at both ends.
    return ' '.join(text.split())
# Module-level accumulators filled by get_cast_from_link(): each cast row
# appends one entry to all three lists, so they stay index-aligned and can
# serve as DataFrame columns.
# (The original author flagged these lists as the suspected problem spot.)
actor_names = []
titles = []
dates = []
def get_cast_from_link(movie_link):
    """Fetch one IMDb movie page and record every actor of its cast list.

    For each cast row the actor name is appended to ``actor_names`` and the
    module-level ``movie_title`` / ``movie_date`` values are appended to
    ``titles`` / ``dates``. The tab-separated title, actor and date are also
    printed to stdout. Nothing is returned.

    :param movie_link: string of the link to IMDb movie page (http://imdb.com
        ...)
    :return: void
    """
    response = requests.get(movie_link)
    # Parse only the cast_list table instead of the whole page.
    only_cast_table = SoupStrainer('table', class_='cast_list')
    cast_soup = BeautifulSoup(response.content, 'html.parser', parse_only=only_cast_table)

    for row in cast_soup.find_all('tr'):
        name_tag = row.find(itemprop='name')
        # Rows without a name element (e.g. spacer rows) are skipped.
        if name_tag is None:
            continue
        actor = clean_text(name_tag.text)
        actor_names.append(actor)
        titles.append(movie_title)
        dates.append(movie_date)
        print('\t'.join([movie_title, actor, movie_date]))
for each in All_page:
    # Use requests.get('url') to load the page you want
    web_page = requests.get(each)
    # https://www.imdb.com/search/title?countries=tr&languages=tr&count=250&view=simple&page=2
    # Prepare the SoupStrainer to strain just the div containing the list of movies
    list_strainer = SoupStrainer('div', class_='lister-list')
    # Parse the html content of the web page with BeautifulSoup
    soup = BeautifulSoup(web_page.content, 'html.parser', parse_only=list_strainer)
    # Generate a list of the "Rank & Title" column of each row and iterate
    movie_list = soup.find_all('span', class_='lister-item-header')
    for movie in movie_list:
        # get_cast_from_link() reads movie_title and movie_date as
        # module-level names, so both must be assigned before the call.
        movie_title = movie.a.text
        movie_date = movie.find('span', class_='lister-item-year text-muted unbold').text
        # get the link to the movie's own IMDb page, and jump over
        link = 'http://imdb.com' + movie.a.get('href')
        get_cast_from_link(link)
        # remember to be nice, and sleep a while between requests!
        sleep(15)

# Export data frame
# This must run AFTER the scraping loop: built before it, the lists are still
# empty and the exported frame has no rows.
tsd_df = pd.DataFrame({'Actor_Names': actor_names,
                       'Movie_Title': titles,
                       'Movie_Date': dates})
# The target is a .tsv file, so write tab-separated values.
tsd_df.to_csv('/Users/ea/Desktop/movie_df.tsv', sep='\t', encoding='utf-8')
I cannot get the span text within the "table". Thanks!
from bs4 import BeautifulSoup
import urllib2
url1 = "url"
content1 = urllib2.urlopen(url1).read()
soup = BeautifulSoup(content1, "lxml")

# find_all() (findAll is its older alias) returns a list-like ResultSet with
# every matching <div>, not a single tag.
table = soup.find_all("div", attrs={"class": "iw_component", "id": "c1417094965154"})
# recursive=False restricts the search to direct children.
rows = table.find_all('span', recursive=False)
for span in rows:
    print(span.text)
table = soup.findAll("div", {"class" : "iw_component","id":"c1417094965154"})
In the above line, findAll() returns a list.
So the next line raises an error: it calls find_all() on that list (a ResultSet) instead of on a single element.
If you expect only one table, try using the following code. Just replace
rows = table.find_all('span',recursive=False)
with
rows = table[0].find_all('span')
If you expect multiple tables in the page, run a for loop on the table and then run the rest of the statements inside the for loop.
Also, for prettier output, you can remove the tab characters as in the following code:
row = row.get_text()
row = row.replace('\t', '')
print(row)
The final working code for you is:
from bs4 import BeautifulSoup
import urllib2
url1 = "url"
content1 = urllib2.urlopen(url1).read()
soup = BeautifulSoup(content1, "lxml")

# find_all() returns every matching <div>; the first match is used.
table = soup.find_all("div", attrs={"class": "iw_component", "id": "c1417094965154"})
for span in table[0].find_all('span'):
    # Print the span text with the tab characters removed.
    print(span.get_text().replace('\t', ''))
Regarding recursive=False parameter, if it's set to false, it will only find in direct children which, in your case will give no result.
Recursive Argument in find()
If you only want Beautiful Soup to consider direct children, you can pass in recursive=False
Here's another approach using lxml instead of beautifulsoup:
import requests
from lxml import html
req = requests.get("<URL>")
raw_html = html.fromstring(req.text)
# XPath tests attributes with "@": [@id="..."] picks the div by its id, and
# //span/text() collects the text nodes of every span below it.
spans = raw_html.xpath('//div[@id="c1417094965154"]//span/text()')
# Strip tabs, CRLF line ends and surrounding blanks, then glue the pieces.
print("".join(x.replace("\t", "").replace("\r\n", "").strip() for x in spans))
Output: Kranji Mile Day simulcast races, Kranji Racecourse, SINClass 3 Handicap - 1200M TURFSaturday, 26 May 2018Race 1, 5:15 PM
As you can see, the output needs a little formatting; spans is a list with the text of every span, so you can do any processing you need.
You seem to use python 2.x, here is a python 3.x solution, since I do not have a python 2.x environment at the moment :
from bs4 import BeautifulSoup
import urllib.request as urllib
url1 = "<URL>"

# Read the HTML page
content1 = urllib.urlopen(url1).read()
soup = BeautifulSoup(content1, "lxml")

# The page is expected to contain a single such div, so find() (one tag) is
# used rather than findAll() (a list of tags).
div = soup.find("div", class_="iw_component", id="c1417094965154")

# Text of every span inside the div, tabs removed, each followed by ", ".
line = "".join(
    span.get_text().replace('\t', '') + ", " for span in div.find_all("span")
)
print(line)
So far my code works: it successfully gets the text from these two websites:
http://www.tutorialspoint.com/cplusplus/index.htm
http://www.cplusplus.com/doc/tutorial/program_structure/
But I don't know what I am doing wrong: it does not get the text from other websites, and it gives me an error when I use other links such as:
http://www.cmpe.boun.edu.tr/~akin/cmpe223/chap2.htm
http://www.i-programmer.info/babbages-bag/477-trees.html
http://www.w3schools.com/html/html_elements.asp
Error:
Traceback (most recent call last):
File "C:\Users\DELL\Desktop\python\s\fyp\data extraction.py", line 20, in
text = soup.select('.C_doc')[0].get_text()
IndexError: list index out of range
My code:
import urllib
from bs4 import BeautifulSoup
url = "http://www.i-programmer.info/babbages-bag/477-trees.html" #unsuccessfull
#url = "http://www.tutorialspoint.com/cplusplus/index.htm" #doing successfully
#url = "http://www.cplusplus.com/doc/tutorial/program_structure/" #doing successfully
html = urllib.urlopen(url).read()
soup = BeautifulSoup(html)
# kill all script and style elements
for script in soup(["script", "style","a","<div id=\"bottom\" >"]):
script.extract() # rip it out
# get text
#text = soup.select('.C_doc')[0].get_text()
#text = soup.select('.content')[0].get_text()
if soup.select('.content'):
text = soup.select('.content')[0].get_text()
else:
text = soup.select('.C_doc')[0].get_text()
# break into lines and remove leading and trailing space on each
lines = (line.strip() for line in text.splitlines())
# break multi-headlines into a line each
chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
# drop blank lines
text = '\n'.join(chunk for chunk in chunks if chunk)
print text
fo = open('foo.txt', 'w')
fo.seek(0, 2)
line = fo.writelines( text )
fo.close()
#writing done :)
Try using
Text = soup.findAll(text=True)
UPDATE
This is a basic text stripper you can start from.
import urllib
from bs4 import BeautifulSoup
url = "http://www.i-programmer.info/babbages-bag/477-trees.html"
html = urllib.urlopen(url).read()
soup = BeautifulSoup(html)
for script in soup(["script", "style","a","<div id=\"bottom\" >"]):
script.extract()
text = soup.findAll(text=True)
for p in text:
print p
You are assuming that every website you scrape has an element with the class name content or C_doc.
What if the website you scrape has neither of these classes?
Here is the fix:
# Take the text of the first element matching one of the known container
# classes; text stays empty when the page has neither of them.
text = ''
for selector in ('.content', '.C_doc'):
    # Run each selector once and reuse the result.
    matches = soup.select(selector)
    if matches:
        text = matches[0].get_text()
        break

if text:
    # A branch needs at least one statement; replace "pass" with the rest
    # of the code.
    pass
else:
    print('text does not exists.')