Why does TypeError appear occasionally? - python

My Python scraping program is running into a TypeError.
Here's my code:
from bs4 import BeautifulSoup
import requests, feedparser

cqrss = feedparser.parse('https://www.reddit.com/r/pics/new.rss')

for submission in cqrss.entries:
    folder_name = submission.title  # use for create folder
    reddit_url = submission.link

    source = requests.get(reddit_url)
    plain_text = source.content
    soup = BeautifulSoup(plain_text, 'lxml')

    title = soup.find('a', 'title may-blank outbound', href=True)
    if 'imgur.com' in title['href']:
        imgur_link = title['href']
    print(imgur_link)
Error:
if 'imgur.com' in title['href']:
TypeError: 'NoneType' object is not subscriptable
What did I do wrong?

find "fails" (i.e. does not find anything) for some data and returns None.
if title and 'imgur.com' in title['href']:
    imgur_link = title['href']
    print(imgur_link)
should work.
Note that print was moved under the if clause, as it obviously makes no sense to call it if the data isn't there.
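For completeness, a minimal sketch of the corrected loop with the guard in place (same feed and selector as the question; posts whose markup lacks that anchor are simply skipped):

from bs4 import BeautifulSoup
import requests, feedparser

cqrss = feedparser.parse('https://www.reddit.com/r/pics/new.rss')

for submission in cqrss.entries:
    source = requests.get(submission.link)
    soup = BeautifulSoup(source.content, 'lxml')

    # find() returns None when no matching anchor exists, so guard before subscripting
    title = soup.find('a', 'title may-blank outbound', href=True)
    if title and 'imgur.com' in title['href']:
        print(title['href'])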

Related

BeautifulSoup and Lists

I am attempting to use BeautifulSoup to look through and request each URL in a txt file. So far I am able to scrape the first link for what I seek; progressing to the next URL, I hit an error.
This is the error I keep getting:
AttributeError: ResultSet object has no attribute 'find'. You're
probably treating a list of elements like a single element. Did you
call find_all() when you meant to call find()?
from bs4 import BeautifulSoup as bs
import requests
import constants as c

file = open(c.fvtxt)
read = file.readlines()
res = []
DOMAIN = c.vatican_domain
pdf = []

def get_soup(url):
    return bs(requests.get(url).text, 'html.parser')

for link in read:
    bs = get_soup(link)
    res.append(bs)
    soup = bs.find('div', {'class': 'headerpdf'})
    pdff = soup.find('a')
    li = pdff.get('href')
    surl = f"{DOMAIN}{li}"
    pdf.append(f"{surl}\n")
    print(pdf)
Your variable name is the problem: you use the same name, bs, for both the BeautifulSoup import alias and the loop variable, so after the first iteration get_soup no longer calls BeautifulSoup.
It should work fine if you rename the variable bs to parsed_text or anything other than bs.
for link in read:
    parsed_text = get_soup(link)
    res.append(parsed_text)
    soup = parsed_text.find('div', {'class': 'headerpdf'})
    pdff = soup.find('a')
    li = pdff.get('href')
    print(li)
    surl = f"{DOMAIN}{li}"
    pdf.append(f"{surl}\n")
    print(pdf)
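For what it's worth, a minimal sketch of why the shadowing bites (this relies on the documented bs4 behaviour that calling a soup object is shorthand for find_all()):

from bs4 import BeautifulSoup as bs

soup = bs("<p>hello</p>", "html.parser")
print(soup("p"))  # calling a soup object delegates to find_all() and returns a ResultSet

bs = soup  # rebinding the name, just as the original loop does
# From now on, bs(...) inside get_soup calls soup.find_all(...) instead of BeautifulSoup(...),
# so get_soup returns a ResultSet, and ResultSet has no .find(), hence the AttributeError.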

BeautifulSoup No Attribute 'get'

Trying to make a manga downloader. So far so good. But I'm so tired because I've been doing this all day.
import requests
from bs4 import BeautifulSoup

title = input("Title: ")
chapter = input("Chapter: ")

result = requests.get("http://www.mangapanda.com/{}/{}".format(title, chapter))
src = result.content
soup = BeautifulSoup(src, "html.parser")

find = soup.find(id="selectpage").findAll('option')  # A drop down menu

for i in range(len(find)):  # Find how many options are in the drop down menu
    result2 = requests.get("http://www.mangapanda.com/{}/{}/{}".format(title, chapter, i + 1))
    src2 = result2.content
    soup2 = BeautifulSoup(src2, "html.parser")
    image = soup2.find(id="img").get('src')
    title = soup2.find(id="img").get('alt')
    with open(title + ".jpeg", 'wb') as f:
        f.write(requests.get(image).content)
When I test it and enter "bleach", "4", it gives me an error:
image = soup2.find(id="img").get('src')
AttributeError: 'NoneType' object has no attribute 'get'
Which is weird since it downloads the first page.
Thanks In Advance!
Also, it worked earlier and downloaded everything, but I changed a few lines here and there and I don't know what broke. I'm so dumb.
The problem is that title = soup2.find(id="img").get('alt') overwrites the title variable you are using in requests.get("http://www.mangapanda.com/{}/{}/{}".format(title, chapter, i + 1)), so from the second iteration on the request goes to the wrong URL. Try using a variable with a different name.
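For illustration, a hedged sketch of just the download loop with the clashing variable renamed (page_title is my name, not the original author's; find, title and chapter come from the question's code above):

for i in range(len(find)):
    result2 = requests.get("http://www.mangapanda.com/{}/{}/{}".format(title, chapter, i + 1))
    soup2 = BeautifulSoup(result2.content, "html.parser")
    img = soup2.find(id="img")
    if img is None:  # guard in case a page has no element with id="img"
        continue
    image = img.get('src')
    page_title = img.get('alt')  # renamed so the title used to build the URL stays intact
    with open(page_title + ".jpeg", 'wb') as f:
        f.write(requests.get(image).content)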

BS4 Not Locating Element in Python

I am somewhat new to Python and can't for the life of me figure out why the following code isn't pulling the element I am trying to get:
for player in all_players:
    player_first, player_last = player.split()
    player_first = player_first.lower()
    player_last = player_last.lower()
    first_name_letters = player_first[:2]
    last_name_letters = player_last[:5]

    player_url_code = '/{}/{}{}01'.format(last_name_letters[0], last_name_letters, first_name_letters)
    player_url = 'https://www.basketball-reference.com/players' + player_url_code + '.html'
    print(player_url)  # test

    req = urlopen(player_url)
    soup = bs.BeautifulSoup(req, 'lxml')

    wrapper = soup.find('div', id='all_advanced_pbp')
    table = wrapper.find('div', class_='table_outer_container')

    for td in table.find_all('td'):
        player_pbp_data.append(td.get_text())
Currently returning:
--> for td in table.find_all('td'):
        player_pbp_data.append(td.get_text())  # if this works, would like to
AttributeError: 'NoneType' object has no attribute 'find_all'
Note: iterating through the children of the wrapper object returns <div class="table_outer_container"> as part of the tree.
Thanks!
Make sure that table contains the data you expect.
For example, https://www.basketball-reference.com/players/a/abdulka01.html doesn't seem to contain a div with id='all_advanced_pbp'.
Try explicitly passing the HTML instead:
bs.BeautifulSoup(the_html, 'html.parser')
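The practical way to act on that advice is to guard each find() result before chaining further calls. A minimal sketch using the question's own variable names (the printed messages are mine, purely illustrative):

wrapper = soup.find('div', id='all_advanced_pbp')
if wrapper is not None:
    table = wrapper.find('div', class_='table_outer_container')
    if table is not None:
        for td in table.find_all('td'):
            player_pbp_data.append(td.get_text())
    else:
        print('no table_outer_container found on', player_url)
else:
    print('no all_advanced_pbp wrapper found on', player_url)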
I tried to extract data from the URL you gave but did not get the full DOM. I then opened the page in a browser with and without JavaScript: the site needs JavaScript to load some of its data, although pages like the player index do not. The simple way to get dynamic data is to use Selenium.
This is my test code:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver

player_pbp_data = []

def get_list(t="a"):
    with requests.Session() as se:
        url = "https://www.basketball-reference.com/players/{}/".format(t)
        req = se.get(url)
        soup = BeautifulSoup(req.text, "lxml")
        with open("a.html", "wb") as f:
            f.write(req.text.encode())
        table = soup.find("div", class_="table_wrapper setup_long long")
        players = {player.a.text: "https://www.basketball-reference.com" + player.a["href"] for player in table.find_all("th", class_="left ")}

def get_each_player(player_url="https://www.basketball-reference.com/players/a/abdulta01.html"):
    with webdriver.Chrome() as ph:
        ph.get(player_url)
        text = ph.page_source
    '''
    with requests.Session() as se:
        text = se.get(player_url).text
    '''
    soup = BeautifulSoup(text, 'lxml')
    try:
        wrapper = soup.find('div', id='all_advanced_pbp')
        table = wrapper.find('div', class_='table_outer_container')
        for td in table.find_all('td'):
            player_pbp_data.append(td.get_text())
    except Exception as e:
        print("This page does not contain pbp")

get_each_player()

web-scraping using python ('NoneType' object is not iterable)

I am new to Python and web scraping. I am trying to scrape a website (the link is the url in the code below). I am getting the error "'NoneType' object is not iterable" on the last line of the code. Could anyone point out what could have gone wrong?
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

url = 'https://labtestsonline.org/tests-index'
soup = BeautifulSoup(requests.get(url).content, 'lxml')

# Function to get hyper-links for all test components
hyperlinks = []
def parseUrl(url):
    global hyperlinks
    page = requests.get(url).content
    soup = BeautifulSoup(page, 'lxml')
    for a in soup.findAll('div',{'class':'field-content'}):
        a = a.find('a')
        href = urlparse.urljoin(Url,a.get('href'))
        hyperlinks.append(href)
parseUrl(url)

# function to get header and common questions for each test component
def header(url):
    page = requests.get(url).content
    soup = BeautifulSoup(page, 'lxml')
    h = []
    commonquestions = []
    for head in soup.find('div',{'class':'field-item'}).find('h1'):
        heading = head.get_text()
        h.append(heading)
    for q in soup.find('div',{'id':'Common_Questions'}):
        questions = q.get_text()
        commonquestions.append(questions)

for i in range(0, len(hyperlinks)):
    header(hyperlinks[i])
Below is the traceback error:
<ipython-input-50-d99e0af6db20> in <module>()
      1 for i in range(0, len(hyperlinks)):
      2     header(hyperlinks[i])

<ipython-input-49-15ac15f9071e> in header(url)
      5     soup = BeautifulSoup(page, 'lxml')
      6     h = []
      7     for head in soup.find('div',{'class':'field-item'}).find('h1'):
      8         heading = head.get_text()
      9         h.append(heading)

TypeError: 'NoneType' object is not iterable
soup.find('div',{'class':'field-item'}).find('h1') is returning None. First check whether the function returns anything before looping over it.
Something like:
heads = soup.find('div',{'class':'field-item'}).find('h1')
if heads:
    for head in heads:
        # remaining code
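Applied to the question's header() function, a hedged sketch of what that guard could look like (the selectors are unchanged from the question; I also call get_text() on each tag directly rather than iterating over its children, since iterating the result of find() is what raised the TypeError once it returned None):

def header(url):
    page = requests.get(url).content
    soup = BeautifulSoup(page, 'lxml')
    h = []
    commonquestions = []

    field_item = soup.find('div', {'class': 'field-item'})
    h1 = field_item.find('h1') if field_item else None
    if h1:  # find() returns None when the div or the h1 is missing, so guard first
        h.append(h1.get_text())

    common = soup.find('div', {'id': 'Common_Questions'})
    if common:
        commonquestions.append(common.get_text())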
Try this. It should solve the issues you are having at the moment. I used a CSS selector to get the job done.
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

link = 'https://labtestsonline.org/tests-index'
page = requests.get(link)
soup = BeautifulSoup(page.content, 'lxml')

for a in soup.select('.field-content a'):
    new_link = urljoin(link, a.get('href'))  # joining broken urls so as to reuse them
    response = requests.get(new_link)  # sending another http request
    sauce = BeautifulSoup(response.text, 'lxml')
    for item in sauce.select("#Common_Questions .field-item"):
        print(item.text)
    print("<<<<<<<<<>>>>>>>>>>>")

'NoneType' object has no attribute 'get' error

I am trying to get an href link from an a tag, but when I use the .get method it gives me the error "'NoneType' object has no attribute 'get'".
Here is my code:
# Loading Libraries
import urllib
import urllib.request
from bs4 import BeautifulSoup

# define URL for scraping
theurl = "http://www.techspot.com/"
thepage = urllib.request.urlopen(theurl)

# Cooking the Soup
soup = BeautifulSoup(thepage, "html.parser")

i = 1

# Scraping "Project Title" (project-title)
title = soup.findAll('div', {'class': 'article-category'})

for titles in title:
    titleheading = soup.findAll('h2')
    for titletext in titleheading:
        titlename = titletext.a
        titlelink = titlename.get('href')
        print(i)
        print(titlelink)
        i += 1
Here is the console screenshot of the error I am getting.
Tell me what the problem in this code is, or why I am getting this error.
You need to check whether titletext.a is None before you use it.
for titles in title:
    titleheading = soup.findAll('h2')
    for titletext in titleheading:
        if titletext.a:
            titlename = titletext.a
            titlelink = titlename.get('href')
            print(i)
            print(titlelink)
            i += 1
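Equivalently, a hedged variant that avoids the None check by selecting only links inside h2 tags with a CSS selector; note this is my restructuring, not the answer's, and it drops the outer loop over the article-category divs, which the original never actually used:

# select every <a> that sits directly inside an <h2>; anchors without an href are excluded
for i, a in enumerate(soup.select('h2 > a[href]'), start=1):
    print(i)
    print(a.get('href'))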
