BeautifulSoup not letting me get the text - python

I'm looking to get all the text in tag
It gives me the text in the console, but it doesn't put it in the .txt file.
It works with body.text, but not with article.text. I don't know what to do.
import bs4 as bs
import urllib.request
#import re
sauce = urllib.request.urlopen('http://www.bodoniparavia.it/index.php/it/amministrazione-trasparente/bandi-di-gara-e-contratti.html')
soup = bs.BeautifulSoup(sauce,'lxml')
body = soup.body
article = body.find('article')
article1 = article.text
print(article1)
x = open('file.txt','w')
x.write(article1)
x.close

It seems to be working fine for me but try adding encoding = 'utf-8' to the write statement. So the code would now look like this
import bs4 as bs
import urllib.request
#import re
sauce = urllib.request.urlopen('http://www.bodoniparavia.it/index.php/it/amministrazione-trasparente/bandi-di-gara-e-contratti.html')
soup = bs.BeautifulSoup(sauce,'lxml')
body = soup.body
article = body.find('article')
article1 = article.text
print(article1)
x = open('file.txt','w',encoding = 'utf-8')
x.write(article1)
x.close()

Related

Using multiple for loop with Python Using Beautiful Soup

from bs4 import BeautifulSoup as bs
import requests
import pandas as pd
url = "https://www.property24.com/for-sale/woodland-hills-wildlife-estate/bloemfontein/free-state/10467/109825373"
data = requests.get(url)
soup = bs(data.content,"html.parser")
The code below are a test with to get 1 item.
property_overview = soup.find(class_="p24_regularListing").find(class_="p24_propertyOverview").find(class_='p24_propertyOverviewRow').find(class_='col-xs-6 p24_propertyOverviewKey').text
property_overview
Output : 'Listing Number'
The code below is what we have to get all the col-xs-6 p24_propertyOverviewKey
p24_regularListing_items = soup.find_all(class_="p24_regularListing")
for p24_propertyOverview_item in p24_regularListing_items:
p24_propertyOverview_items = p24_propertyOverview_item.find_all(class_="p24_propertyOverview")
for p24_propertyOverviewRow_item in p24_propertyOverview_items:
p24_propertyOverviewRow_items = p24_propertyOverviewRow_item.find_all(class_="p24_propertyOverviewRow")
for p24_propertyOverviewKey_item in p24_propertyOverviewRow_items:
p24_propertyOverviewKey_items = p24_propertyOverviewKey_item.find_all(class_="col-xs-6 p24_propertyOverviewKey")
p24_propertyOverviewKey_items
The code above only outputs 1 item. and not all
To put things more simply, you can use soup.select() (and via the comments, you can then use .get_text() to extract the text from each tag).
from bs4 import BeautifulSoup
import requests
resp = requests.get(
"https://www.property24.com/for-sale/woodland-hills-wildlife-estate/bloemfontein/free-state/10467/109825373"
)
resp.raise_for_status()
soup = BeautifulSoup(resp.content, "html.parser")
texts = []
for tag in soup.select(
# NB: this selector uses Python's implicit string concatenation
# to split it onto several lines.
".p24_regularListing "
".p24_propertyOverview "
".p24_propertyOverviewRow "
".p24_propertyOverviewKey"
):
texts.append(tag.get_text())
print(texts)

Split numbers from api with commas?

import urllib.request,
urllib.parse, urllib.error
from bs4 import BeautifulSoup
url = "https://api.monzo.com/crowdfunding-investment/total"
html = urllib.request.urlopen(url).read()
soup = BeautifulSoup(html)
# kill all script and style elements
for script in soup(["script", "style"]):
script.extract() # rip it out
# get text
text = soup.get_text()
if 'invested_amount' in text:
result = text.split(",")
invested = str(result[1])
investedn = invested.split(':')[1]
print(investedn)
Hi all. I’m trying to split investedn into thousands with commas. Anyone know how to do this?
Also, how can I remove the last four numbers from the string?
Thanks!
Simply use
"{:,}".format(number)
https://docs.python.org/3/library/string.html#format-specification-mini-language
e.g.
In [19]: "{:,}".format(17462233620)
Out[19]: '17,462,233,620'
Managed to fix!
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
url = "https://api.monzo.com/crowdfunding-investment/total"
html = urllib.request.urlopen(url).read()
soup = BeautifulSoup(html)
# kill all script and style elements
for script in soup(["script", "style"]):
script.extract() # rip it out
# get text
text = soup.get_text()
if 'invested_amount' in text:
result = text.split(",")
invested = str(result[1])
investedn = invested.split(':')[1]
plainnum = int(str(investedn)[:-4])
number = "{:,}".format(int(plainnum))
print(number)
I had messed up quite a bit, but figured it out.
Thanks!
The text you got back from that URL is not HTML. It is data encoded in JSON format, which is easy to parse:
import urllib.request
import json
url = "https://api.monzo.com/crowdfunding-investment/total"
json_text = urllib.request.urlopen(url).read()
json_text = json_text.decode('utf-8')
data = json.loads(json_text)
print(data)
print('Invested amount: {:,}'.format(data['invested_amount']))
Output:
{'invested_amount': 17529735495, 'share_price': 77145, 'shares_invested': 227231, 'max_shares': 2592520, 'max_amount': 199999955400, 'status': 'pending'}
Invested amount: 17,529,735,495
Notes
json_text was an array of bytes, not a string. That is why I decode it using a guess of UTF-8.
data is just a normal Python dictionary.
a = "17462233620"
b = ""
for i in range(len(a), 0 , -3):
b = a[i-3:i]+","+b
b = "£" + a[0:i] + b[:-1]
print(b) # Output £17,462,233,620

Removing UTF 8 encoding in python

I tried scraping the webpage for Passengers & Cargo data. I couldn't convert them into normal data, and web encoding seems to be the challenge.
The Code I used is:
from __future__ import print_function
import requests
import pandas as pd
from bs4 import BeautifulSoup
import urllib
url = "https://www.faa.gov/data_research/passengers_cargo/unruly_passengers/"
r = requests.get(url)
soup = BeautifulSoup(r.content)
links = soup.find_all("tbody")
for link in links:
print(link.text)
Output1
This prints in the format Year and Total. But when I append it to a list, the encoding ruins the data. You can see that in Output1
names = []
for link in links:
names.append(link.text)
names = map(lambda x: x.strip().encode('ascii'), names)
print(names)
Output2
The desired output should be Years and Total for me to perform analyses
You can use find_all tr and td like this:
import requests
from bs4 import BeautifulSoup
import urllib
url = "https://www.faa.gov/data_research/passengers_cargo/unruly_passengers/"
r = requests.get(url)
soup = BeautifulSoup(r.content)
links = soup.find_all("tr")
data = []
for link in links:
tds = link.find_all('td')
if tds:
data.append({'year':tds[0].text,'total':tds[1].text})
print(data)
It's worked.
Hope it helps you

BeautifulSoup Cannot Find Tag

I am trying to scrape this page and all of the other pages like it. I have been using BeautifulSoup (also have tried lxml but there have been installation issues). I am using the following code:
value = "http://www.presidency.ucsb.edu/ws/index.php?pid=99556"
desiredTag = "span"
r = urllib2.urlopen(value)
data = BeautifulSoup(r.read(), 'html5lib')
displayText = data.find_all(desiredTag)
print displayText
displayText = " ".join(str(displayText))
displayText = BeautifulSoup(displayText, 'html5lib')
For some reason this isn't pull back the <span class="displaytext"> and also I have tried desiredTag as p
Am I missing something?
You are definitely experiencing the differences between different parsers used by BeautifulSoup. html.parser and lxml worked for me:
data = BeautifulSoup(urllib2.urlopen(value), 'html.parser')
Proof:
>>> import urllib2
>>> from bs4 import BeautifulSoup
>>>
>>> url = "http://www.presidency.ucsb.edu/ws/index.php?pid=99556"
>>>
>>> data = BeautifulSoup(urllib2.urlopen(url), 'html.parser')
>>> data.find("span", class_="displaytext").text
u'PARTICIPANTS:Former Speaker of the House Newt Gingrich (GA);
...

What is wrong with my web scraper code (python3.4)

I am trying to scrape a table from a website. It runs but I am not getting an output to my file. Where am I going wrong?
Code:
from bs4 import BeautifulSoup
import urllib.request
f = open('nbapro.txt','w')
errorFile = open('nbaerror.txt','w')
page = urllib.request.urlopen('http://www.numberfire.com/nba/fantasy/full-fantasy-basketball-projections')
content = page.read()
soup = BeautifulSoup(content)
tableStats = soup.find('table', {'class': 'data-table xsmall'})
for row in tableStats.findAll('tr')[2:]:
col = row.findAll('td')
try:
name = col[0].a.string.strip()
f.write(name+'\n')
except Exception as e:
errorFile.write (str(e) + '******'+ str(col) + '\n')
pass
f.close
errorFile.close
The problem is that the table data you are trying to scrape is filled out by invoking javascript code on the browser-side. urllib is not a browser and, hence, cannot execute javascript.
If you want to solve it via urllib and BeautifulSoup, you have to extract the JSON object from the script tag and load it via json.loads(). Example, that prints player names:
import json
import re
import urllib.request
from bs4 import BeautifulSoup
soup = BeautifulSoup(urllib.request.urlopen('http://www.numberfire.com/nba/fantasy/full-fantasy-basketball-projections'))
script = soup.find('script', text=lambda x: x and 'NF_DATA' in x).text
data = re.search(r'NF_DATA = (.*?);', script).group(1)
data = json.loads(data)
for player_id, player in data['players'].items():
print(player['name'] + ' ' + player['last_name'])

Categories