BeautifulSoup and Lists - python

I am attempting to use beautifulsoup to look through and request each url in a txt file. So far I am able to scrape the first link for what I seek, progressing to the next url I hit an error.
This is the error I keep getting:
AttributeError: ResultSet object has no attribute 'find'. You're
probably treating a list of elements like a single element. Did you
call find_all() when you meant to call find()?
from bs4 import BeautifulSoup as bs
import requests
import constants as c
file = open(c.fvtxt)
read = file.readlines()
res = []
DOMAIN = c.vatican_domain
pdf = []
def get_soup(url):
return bs(requests.get(url).text, 'html.parser')
for link in read:
bs = get_soup(link)
res.append(bs)
soup = bs.find('div', {'class': 'headerpdf'})
pdff = soup.find('a')
li = pdff.get('href')
surl = f"{DOMAIN}{li}"
pdf.append(f"{surl}\n")
print(pdf)

It's your variable name confuses the Python interpreter, you cannot have the same name as a function and a variable at the same time, in your case 'bs'.
It should work fine if you rename the variable bs to parsed_text or something else but bs.
for link in read:
parsed_text = get_soup(link)
res.append(parsed_text)
soup = parsed_text.find('div', {'class': 'headerpdf'})
pdff = soup.find('a')
li = pdff.get('href')
print(li)
surl = f"{DOMAIN}{li}"
pdf.append(f"{surl}\n")
print(pdf)
The result:

Related

BS4 Not Locating Element in Python

I am somewhat new to Python and can't for the life of me figure out why the following code isn’t pulling the element I am trying to get.
It currently returns:
for player in all_players:
player_first, player_last = player.split()
player_first = player_first.lower()
player_last = player_last.lower()
first_name_letters = player_first[:2]
last_name_letters = player_last[:5]
player_url_code = '/{}/{}{}01'.format(last_name_letters[0], last_name_letters, first_name_letters)
player_url = 'https://www.basketball-reference.com/players' + player_url_code + '.html'
print(player_url) #test
req = urlopen(player_url)
soup = bs.BeautifulSoup(req, 'lxml')
wrapper = soup.find('div', id='all_advanced_pbp')
table = wrapper.find('div', class_='table_outer_container')
for td in table.find_all('td'):
player_pbp_data.append(td.get_text())
Currently returning:
--> for td in table.find_all('td'):
player_pbp_data.append(td.get_text()) #if this works, would like to
AttributeError: 'NoneType' object has no attribute 'find_all'
Note: iterating through children of the wrapper object returns:
< div class="table_outer_container" > as part of the tree.
Thanks!
Make sure that table contains the data you expect.
For example https://www.basketball-reference.com/players/a/abdulka01.html doesn't seem to contain a div with id='all_advanced_pbp'
Try to explicitly pass the html instead:
bs.BeautifulSoup(the_html, 'html.parser')
I trie to extract data from the url you gave but it did not get full DOM. after then i try to access the page with browser with javascrip and without javascrip, i know website need javascrip to load some data. But the page like players it need not. The simple way to get dynamic data is using selenium
This is my test code
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
player_pbp_data = []
def get_list(t="a"):
with requests.Session() as se:
url = "https://www.basketball-reference.com/players/{}/".format(t)
req = se.get(url)
soup = BeautifulSoup(req.text,"lxml")
with open("a.html","wb") as f:
f.write(req.text.encode())
table = soup.find("div",class_="table_wrapper setup_long long")
players = {player.a.text:"https://www.basketball-reference.com"+player.a["href"] for player in table.find_all("th",class_="left ")}
def get_each_player(player_url="https://www.basketball-reference.com/players/a/abdulta01.html"):
with webdriver.Chrome() as ph:
ph.get(player_url)
text = ph.page_source
'''
with requests.Session() as se:
text = se.get(player_url).text
'''
soup = BeautifulSoup(text, 'lxml')
try:
wrapper = soup.find('div', id='all_advanced_pbp')
table = wrapper.find('div', class_='table_outer_container')
for td in table.find_all('td'):
player_pbp_data.append(td.get_text())
except Exception as e:
print("This page dose not contain pbp")
get_each_player()

web-scraping using python ('NoneType' object is not iterable)

I am new to python and web-scraping. I am trying to scrape a website (link is the url). I am getting an error as "'NoneType' object is not iterable", with the last line of below code. Could anyone point what could have gone wrong?
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
url = 'https://labtestsonline.org/tests-index'
soup = BeautifulSoup(requests.get(url).content, 'lxml')
# Function to get hyper-links for all test components
hyperlinks = []
def parseUrl(url):
global hyperlinks
page = requests.get(url).content
soup = BeautifulSoup(page, 'lxml')
for a in soup.findAll('div',{'class':'field-content'}):
a = a.find('a')
href = urlparse.urljoin(Url,a.get('href'))
hyperlinks.append(href)
parseUrl(url)
# function to get header and common questions for each test component
def header(url):
page = requests.get(url).content
soup = BeautifulSoup(page, 'lxml')
h = []
commonquestions = []
for head in soup.find('div',{'class':'field-item'}).find('h1'):
heading = head.get_text()
h.append(heading)
for q in soup.find('div',{'id':'Common_Questions'}):
questions = q.get_text()
commonquestions.append(questions)
for i in range(0, len(hyperlinks)):
header(hyperlinks[i])
Below is the traceback error:
<ipython-input-50-d99e0af6db20> in <module>()
1 for i in range(0, len(hyperlinks)):
2 header(hyperlinks[i])
<ipython-input-49-15ac15f9071e> in header(url)
5 soup = BeautifulSoup(page, 'lxml')
6 h = []
for head in soup.find('div',{'class':'field-item'}).find('h1'):
heading = head.get_text()
h.append(heading)
TypeError: 'NoneType' object is not iterable
soup.find('div',{'class':'field-item'}).find('h1') is returning None. First check whether the function returns anything before looping over it.
Something like:
heads = soup.find('div',{'class':'field-item'}).find('h1')
if heads:
for head in heads:
# remaining code
Try this. It should solve the issues you are having at this moment. I used css selector to get the job done.
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
link = 'https://labtestsonline.org/tests-index'
page = requests.get(link)
soup = BeautifulSoup(page.content, 'lxml')
for a in soup.select('.field-content a'):
new_link = urljoin(link,a.get('href')) ##joining broken urls so as to reuse these
response = requests.get(new_link) ##sending another http requests
sauce = BeautifulSoup(response.text,'lxml')
for item in sauce.select("#Common_Questions .field-item"):
print(item.text)
print("<<<<<<<<<>>>>>>>>>>>")

Why does TypeError appear occasionally?

My python scrapping program is running into TypeError.
Here's my code:
from bs4 import BeautifulSoup
import requests, feedparser
cqrss = feedparser.parse('https://www.reddit.com/r/pics/new.rss')
for submission in cqrss.entries:
folder_name = submission.title #use for create folder
reddit_url = submission.link
source = requests.get(reddit_url)
plain_text = source.content
soup = BeautifulSoup(plain_text, 'lxml')
title = soup.find('a', 'title may-blank outbound', href=True)
if 'imgur.com' in title['href']:
imgur_link = title['href']
print(imgur_link)
Error:
if 'imgur.com' in title['href']:
TypeError: 'NoneType' object is not subscriptable
What did I do wrong?
find "fails" (i.e. does not find anything) for some data and returns None.
if title and 'imgur.com' in title['href']:
imgur_link = title['href']
print(imgur_link)
should work.
Note that print was moved under the if clause, as it obviously does not make sense to call it, if data isn't there.

BeautifulSoup: Extracting string between tags does not seem to work

I am currently parsing this url. the Url will be the argument for the parse function.
def parse(sitemap):
req = urllib.request.urlopen(sitemap)
soup = BeautifulSoup(req, 'lxml')
soup.prettify()
inventory_url = []
inventory_url_set = set()
for item in soup.find_all('url'):
print(item.find('lastmod'))
# print(item.find('lastmod').text)
inventory_url_set.add(item.find('loc').text)
However, item.find('lastmod').text retuns an AttributeError whereas if I were to print the whole tag item.find('lastmod') it works fine.
I'd like to only obtain the text in between the 'lastmod' tag from within each 'item'.
Thanks
Not all of the url entries contain a lastmod, so you need to test for that. If you use a dictionary, you could store the lastmod as values and still benefit from having unique URLs as follows:
from bs4 import BeautifulSoup
import urllib.request
def parse(sitemap):
req = urllib.request.urlopen(sitemap)
soup = BeautifulSoup(req, 'lxml')
inventory_urls = {}
for url in soup.find_all('url'):
if url.lastmod:
lastmod = url.lastmod.text
else:
lastmod = None
inventory_urls[url.loc.text] = lastmod
for url, lastmod in inventory_urls.items():
print(lastmod, url)
parse("https://www.kith.com/sitemap_products_1.xml")
This would give you a list starting as follows:
2017-02-12T03:55:25Z https://kith.com/products/adidas-originals-stan-smith-wool-pk-grey-white
2017-03-13T18:55:24Z https://kith.com/products/norse-projects-niels-pocket-boucle-tee-black
2017-03-15T17:20:47Z https://kith.com/products/ronnie-fieg-x-fracap-rf120-rust
2017-03-17T01:30:25Z https://kith.com/products/new-balance-696-birch
2017-01-23T08:43:56Z https://kith.com/products/ronnie-fieg-x-diamond-supply-co-x-asics-gel-lyte-v-1
2017-03-17T00:41:03Z https://kith.com/products/off-white-diagonal-ferns-hoodie-black
2017-03-16T15:01:55Z https://kith.com/products/norse-projects-skagen-bubble-crewneck-charcoal
2017-02-21T15:57:56Z https://kith.com/products/vasque-eriksson-gtx-brown-black

Web crawler - following links

Please bear with me. I am quite new at Python - but having a lot of fun. I am trying to code a web crawler that crawls through election results from the last referendum in Denmark. I have managed to extract all the relevant links from the main page. And now I want Python to follow each of the 92 links and gather 9 pieces of information from each of those pages. But I am so stuck. Hope you can give me a hint.
Here is my code:
import requests
import urllib2
from bs4 import BeautifulSoup
# This is the original url http://www.kmdvalg.dk/
soup = BeautifulSoup(urllib2.urlopen('http://www.kmdvalg.dk/').read())
my_list = []
all_links = soup.find_all("a")
for link in all_links:
link2 = link["href"]
my_list.append(link2)
for i in my_list[1:93]:
print i
# The output shows all the links that I would like to follow and gather information from. How do I do that?
Here is my solution using lxml. It's similar to BeautifulSoup
import lxml
from lxml import html
import requests
page = requests.get('http://www.kmdvalg.dk/main')
tree = html.fromstring(page.content)
my_list = tree.xpath('//div[#class="LetterGroup"]//a/#href') # grab all link
print 'Length of all links = ', len(my_list)
my_list is a list consist of all links. And now you can use for loop to scrape information inside each page.
We can for loop through each links. Inside each page, you can extract information as example. This is only for the top table.
table_information = []
for t in my_list:
page_detail = requests.get(t)
tree = html.fromstring(page_detail.content)
table_key = tree.xpath('//td[#class="statusHeader"]/text()')
table_value = tree.xpath('//td[#class="statusText"]/text()') + tree.xpath('//td[#class="statusText"]/a/text()')
table_information.append(zip([t]*len(table_key), table_key, table_value))
For table below the page,
table_information_below = []
for t in my_list:
page_detail = requests.get(t)
tree = html.fromstring(page_detail.content)
l1 = tree.xpath('//tr[#class="tableRowPrimary"]/td[#class="StemmerNu"]/text()')
l2 = tree.xpath('//tr[#class="tableRowSecondary"]/td[#class="StemmerNu"]/text()')
table_information_below.append([t]+l1+l2)
Hope this help!
A simple approach would be to iterate through your list of urls and parse them each individually:
for url in my_list:
soup = BeautifulSoup(urllib2.urlopen(url).read())
# then parse each page individually here
Alternatively, you could speed things up significantly using Futures.
from requests_futures.sessions import FuturesSession
def my_parse_function(html):
"""Use this function to parse each page"""
soup = BeautifulSoup(html)
all_paragraphs = soup.find_all('p')
return all_paragraphs
session = FuturesSession(max_workers=5)
futures = [session.get(url) for url in my_list]
page_results = [my_parse_function(future.result()) for future in results]
This would be my solution for your problem
import requests
from bs4 import BeautifulSoup
def spider():
url = "http://www.kmdvalg.dk/main"
source_code = requests.get(url)
plain_text = source_code.text
soup = BeautifulSoup(plain_text, 'html.parser')
for link in soup.findAll('div', {'class': 'LetterGroup'}):
anc = link.find('a')
href = anc.get('href')
print(anc.getText())
print(href)
# spider2(href) call a second function from here that is similar to this one(making url = to herf)
spider2(href)
print("\n")
def spider2(linktofollow):
url = linktofollow
source_code = requests.get(url)
plain_text = source_code.text
soup = BeautifulSoup(plain_text, 'html.parser')
for link in soup.findAll('tr', {'class': 'tableRowPrimary'}):
anc = link.find('td')
print(anc.getText())
print("\n")
spider()
its not done... i only get a simple element from the table but you get the idea and how its supposed to work.
Here is my final code that works smooth. Please let me know if I could have done it smarter!
import urllib2
from bs4 import BeautifulSoup
import codecs
f = codecs.open("eu2015valg.txt", "w", encoding="iso-8859-1")
soup = BeautifulSoup(urllib2.urlopen('http://www.kmdvalg.dk/').read())
liste = []
alle_links = soup.find_all("a")
for link in alle_links:
link2 = link["href"]
liste.append(link2)
for url in liste[1:93]:
soup = BeautifulSoup(urllib2.urlopen(url).read().decode('iso-8859-1'))
tds = soup.findAll('td')
stemmernu = soup.findAll('td', class_='StemmerNu')
print >> f, tds[5].string,";",tds[12].string,";",tds[14].string,";",tds[16].string,";", stemmernu[0].string,";",stemmernu[1].string,";",stemmernu[2].string,";",stemmernu[3].string,";",stemmernu[6].string,";",stemmernu[8].string,";",'\r\n'
f.close()

Categories