BeautifulSoup No Attribute 'get' - python

Trying to make a manga downloader. So far so good. But I'm so tired because I've been doing this all day.
import requests
from bs4 import BeautifulSoup
# Ask for the manga slug and chapter number used to build the reader URLs.
title = input("Title: ")
chapter = input("Chapter: ")
# Fetch and parse the chapter's landing page.
result = requests.get("http://www.mangapanda.com/{}/{}".format(title, chapter))
src = result.content
soup = BeautifulSoup(src, "html.parser")
find = soup.find(id="selectpage").findAll('option') # A drop down menu
for i in range(len(find)): # Find how many options are in the drop down menu
# Page numbers in the URL are 1-based, hence i + 1.
result2 = requests.get("http://www.mangapanda.com/{}/{}/{}".format(title, chapter, i + 1))
src2 = result2.content
soup2 = BeautifulSoup(src2, "html.parser")
# find() returns None when no element with id="img" is present, so .get() then raises AttributeError.
image = soup2.find(id="img").get('src')
# NOTE: this rebinds `title`, the same name the requests.get(...) call at the top of the loop formats into the URL on the next pass.
title = soup2.find(id="img").get('alt')
# The image's alt text becomes the output file name.
with open(title + ".jpeg", 'wb') as f:
f.write(requests.get(image).content)
When I test it and write, "bleach", "4"
It Gives me an error
image = soup2.find(id="img").get('src')
AttributeError: 'NoneType' object has no attribute 'get'
Which is weird since it downloads the first page.
Thanks In Advance!
Also it worked earlier, it downloaded everything but I changed a few lines here and there and I don't know the rest.. I'm so dumb.

[edited]
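The problem is that title = soup2.find(id="img").get('alt') overwrites the title variable that is also used in requests.get("http://www.mangapanda.com/{}/{}/{}".format(title, chapter, i + 1)). From the second page onward the URL is built from the image's alt text instead of the manga title, so the page has no element with id="img" and find returns None. Use a variable with a different name for the alt text.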

Related

I've been trying to scrape profile pictures from Instagram using this code, but I keep getting TypeError: 'NoneType' object is not subscriptable

import requests
from bs4 import BeautifulSoup as bs
# Build the profile URL from the user-supplied account name.
User = input("input the username of the user ");
url = 'https://instagram.com/' + User +'/'
r = requests.get(url)
# Alt text the profile image is expected to carry.
alt = User + '\'s profile picture'
soup = bs(r.content, 'html.parser')
# find() returns None when the fetched HTML has no <img> with this alt text; subscripting None raises TypeError.
userImage = soup.find('img',{'alt': alt})['src']
print(userImage)
The code above is what I'm using; it is written in Python. The line that's giving me issues is userImage = soup.find('img',{'alt': alt})['src']
Beautiful soup is unable to find the element you specified, so find is returning None. Then, when you index it with 'src', it tells you that a NoneType is not subscriptable. Make sure the element you're looking for actually exists and your argument to find is correct.
import urllib.parse
import urllib.request
import requests
def profilephoto(username):
    """Download *username*'s Instagram profile picture to ``<username>.jpg``.

    Fetches the profile's ``?__a=1`` response as text and cuts the picture
    URL out from between the ``"profile_pic_url_hd":"`` and
    ``","requested_by_viewer":`` markers. Nothing is written when either
    marker is missing or the extracted URL is empty.

    Args:
        username: Account name; also used as the stem of the output file,
            which is created in the current working directory.

    Returns:
        None.
    """
    url = 'https://www.instagram.com/' + username + "/?__a=1"
    r = requests.get(url).text
    start = '"profile_pic_url_hd":"'
    end = '","requested_by_viewer":'
    start_at = r.find(start)
    end_at = r.rfind(end)
    # find/rfind return -1 on a miss; slicing with -1 would produce junk
    # text instead of a URL, so bail out early.
    if start_at == -1 or end_at == -1:
        return
    profilepic = r[start_at + len(start):end_at]
    # The response escapes "&" as the literal six characters \u0026.
    profilepicurl = profilepic.replace("\\u0026", "&")
    if not profilepicurl:
        return
    # Open the download first so no empty file is left behind if it fails;
    # both handles are closed even when the copy raises.
    with urllib.request.urlopen(profilepicurl) as resource, \
            open(username + ".jpg", "wb") as output:
        output.write(resource.read())

Multiple for loops and csv files

I am new here and newbie with python and currently learning some basic stuff, mostly scraping and I encountered a problem that I hope you can help me to solve.
I'm trying to scrape few details from a website and writing them into a CSV file but I'm able to write only the last results into my CSV, apparently my script just overwrite the data.
Also if you find any mistakes on my code or any room for improvement (which I'm sure there are) I'd be glad if you will point them out as well.
Also2, any recommendation for videos/tutorials that can help me improve my python and scraping skills would be appreciated.
import requests
from bs4 import BeautifulSoup
import csv
# Fetch and parse the club contacts page.
url = 'https://www.tamarackgc.com/club-contacts'
source = requests.get(url).text
soup = BeautifulSoup (source, 'lxml')
# The output file is opened in 'w' mode and closed manually on the last line.
csv_file = open('contacts.csv', 'w')
csv_writer = csv.writer (csv_file)
csv_writer.writerow(["department", "name", "position", "phone"])
# Every loop below searches the whole document (soup), not the element matched by the loop before it.
for department in soup.find_all("div", class_="view-content"):
department_name = department.h3
print (department_name.text)
for contacts in soup.find_all("div", class_="col-md-7 col-xs-10"):
contact_name = contacts.strong
print(contact_name.text)
for position in soup.find_all("div", class_="field-content"):
print(position.text)
for phone in soup.find_all("div", class_="modal-content"):
first_phone = phone.h3
first_phones = first_phone
print(first_phones)
# NOTE(review): the four values written here are whatever the most recent iterations assigned, and they are
# parsed elements rather than their .text. The original nesting of this call is not recoverable here - confirm.
csv_writer.writerow([department_name, contact_name, position, first_phones])
csv_file.close()
Thanks Thomas,
Actually I tweaked my code a little bit by thinking how I can make it simpler (four for loops are too much, no?) so with the following code I solved my problem(dropped the 'department' and 'phones' because some other issues):
import requests
from bs4 import BeautifulSoup
import csv
# Scrape each contact's name and position from the club-contacts page
# into contactslot.csv (one header row, then one row per contact block).
url = 'https://www.tamarackgc.com/club-contacts'
source = requests.get(url).text
soup = BeautifulSoup(source, 'lxml')
# newline="" lets the csv module control line endings itself; the
# with-block closes the file even if a lookup below raises.
with open("contactslot.csv", "w+", newline="") as f:
    csv_writer = csv.writer(f)
    csv_writer.writerow(["Name", "Position"])
    # Each "well profile" div yields one CSV row.
    information = soup.find_all("div", class_="well profile")
    for info in information:
        # The first matching div inside the block holds the name in <strong>.
        contact_name = info.find_all("div", class_="col-md-7 col-xs-10")
        name = contact_name[0].strong.text
        print(name)
        position_name = info.find_all("div", class_="field-content")
        position = position_name[0].text
        print(position)
        print("")
        csv_writer.writerow([name, position])
Hi Babr, welcome to Python. Your answer is good; here is one more small thing you could do better.
Use find instead of find_all if you just want one element.
import requests
from bs4 import BeautifulSoup
import csv
# Scrape each contact's name and position from the club-contacts page
# into a CSV file (one header row, then one row per contact block).
url = 'https://www.tamarackgc.com/club-contacts'
source = requests.get(url).text
soup = BeautifulSoup(source, 'lxml')
# newline="" lets the csv module control line endings itself; the
# with-block closes the file even if a lookup below raises.
with open("/Users/mingjunliu/Downloads/contacts.csv", "w+", newline="") as f:
    csv_writer = csv.writer(f)
    csv_writer.writerow(["Name", "Position"])
    for info in soup.find_all("div", class_="well profile"):
        # find() returns only the first match, which is all that is needed here.
        contact_name = info.find("div", class_="col-md-7 col-xs-10")
        name = contact_name.strong.text
        print(name)
        position_name = info.find("div", class_="field-content")
        position = position_name.text
        print(position)
        print("")
        csv_writer.writerow([name, position])
And the reason you need to drop phone and department is because of the bad website structure. It's not your fault.

BS4 Not Locating Element in Python

I am somewhat new to Python and can't for the life of me figure out why the following code isn’t pulling the element I am trying to get.
It currently returns:
# For every name in all_players, build the player's page URL, fetch it, and collect table cell text.
for player in all_players:
# Unpacking into two names requires exactly two whitespace-separated parts; anything else raises ValueError.
player_first, player_last = player.split()
player_first = player_first.lower()
player_last = player_last.lower()
first_name_letters = player_first[:2]
last_name_letters = player_last[:5]
# URL code layout: /<first letter of last name>/<first 5 of last name><first 2 of first name>01
player_url_code = '/{}/{}{}01'.format(last_name_letters[0], last_name_letters, first_name_letters)
player_url = 'https://www.basketball-reference.com/players' + player_url_code + '.html'
print(player_url) #test
req = urlopen(player_url)
soup = bs.BeautifulSoup(req, 'lxml')
# find() returns None when the requested div is absent from the parsed HTML; the chained calls below then raise AttributeError.
wrapper = soup.find('div', id='all_advanced_pbp')
table = wrapper.find('div', class_='table_outer_container')
for td in table.find_all('td'):
player_pbp_data.append(td.get_text())
Currently returning:
--> for td in table.find_all('td'):
player_pbp_data.append(td.get_text()) #if this works, would like to
AttributeError: 'NoneType' object has no attribute 'find_all'
Note: iterating through children of the wrapper object returns:
< div class="table_outer_container" > as part of the tree.
Thanks!
Make sure that table contains the data you expect.
For example https://www.basketball-reference.com/players/a/abdulka01.html doesn't seem to contain a div with id='all_advanced_pbp'
Try to explicitly pass the html instead:
bs.BeautifulSoup(the_html, 'html.parser')
I tried to extract data from the URL you gave, but the response did not contain the full DOM. I then opened the page in a browser with and without JavaScript enabled, and found that the website needs JavaScript to load some of its data, although pages such as the players index do not need it. The simplest way to get the dynamic data is to use Selenium.
This is my test code
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
player_pbp_data = []
def get_list(t="a"):
    """Fetch the player index page for letter *t* and map names to URLs.

    As a side effect the raw page text is saved to ``a.html`` in the
    current working directory.

    Args:
        t: Index letter appended to ``/players/`` in the URL.

    Returns:
        A dict mapping each player's link text to the absolute profile
        URL, built from the ``<th class="left ">`` cells of the index
        table. Empty when the table wrapper div is not found.
    """
    with requests.Session() as se:
        url = "https://www.basketball-reference.com/players/{}/".format(t)
        req = se.get(url)
    soup = BeautifulSoup(req.text, "lxml")
    # Keep a copy of the fetched page on disk for inspection.
    with open("a.html", "wb") as f:
        f.write(req.text.encode())
    table = soup.find("div", class_="table_wrapper setup_long long")
    if table is None:
        # find() returns None on a miss; report "no players" instead of
        # failing with AttributeError on table.find_all.
        return {}
    players = {
        player.a.text: "https://www.basketball-reference.com" + player.a["href"]
        for player in table.find_all("th", class_="left ")
    }
    return players
def get_each_player(player_url="https://www.basketball-reference.com/players/a/abdulta01.html"):
    """Load a player page in Chrome and collect its advanced play-by-play cells.

    The text of every ``<td>`` inside the ``table_outer_container`` div of
    the ``all_advanced_pbp`` wrapper is appended to the module-level
    ``player_pbp_data`` list. When either div is missing, a message is
    printed and nothing is appended.

    Args:
        player_url: Address of the player page to open.

    Returns:
        None.
    """
    # The page source is taken from a real browser session. The disabled
    # alternative was a plain fetch: requests.Session().get(player_url).text
    # NOTE(review): presumably the browser is needed because the table is
    # filled in by JavaScript - confirm.
    with webdriver.Chrome() as ph:
        ph.get(player_url)
        text = ph.page_source
    soup = BeautifulSoup(text, 'lxml')
    wrapper = soup.find('div', id='all_advanced_pbp')
    # find() returns None on a miss, so test each step explicitly instead
    # of catching every exception, which would also hide unrelated errors.
    table = None
    if wrapper is not None:
        table = wrapper.find('div', class_='table_outer_container')
    if table is None:
        print("This page dose not contain pbp")
        return
    player_pbp_data.extend(td.get_text() for td in table.find_all('td'))
get_each_player()

Why does TypeError appear occasionally?

My Python scraping program is running into a TypeError.
Here's my code:
from bs4 import BeautifulSoup
import requests, feedparser
# Read the subreddit's "new" RSS feed and visit each submission's page.
cqrss = feedparser.parse('https://www.reddit.com/r/pics/new.rss')
for submission in cqrss.entries:
folder_name = submission.title #use for create folder
reddit_url = submission.link
source = requests.get(reddit_url)
plain_text = source.content
soup = BeautifulSoup(plain_text, 'lxml')
# find() returns None when the page has no matching <a>; title['href'] below then raises TypeError.
title = soup.find('a', 'title may-blank outbound', href=True)
if 'imgur.com' in title['href']:
imgur_link = title['href']
print(imgur_link)
Error:
if 'imgur.com' in title['href']:
TypeError: 'NoneType' object is not subscriptable
What did I do wrong?
find "fails" (i.e. does not find anything) for some data and returns None.
# The truthiness test on title short-circuits before title['href'] is evaluated when find() returned None.
if title and 'imgur.com' in title['href']:
imgur_link = title['href']
print(imgur_link)
should work.
Note that print was moved under the if clause, as it obviously does not make sense to call it, if data isn't there.

How to rectify a TypeError in python?Beautiful Soup string to tag error?

Here is simple snippet to scrape Wikipedia website and to print each of its contents separately like cast in separate variable and production in separate variable and so on ..
In the first div, named "bodyContent", there is another div named "mw-content-text". My problem is retrieving the first paragraphs that come before the "h2" tag. I have a code snippet that tries to do this, but it cannot turn a string back into a BeautifulSoup tag, and the error is TypeError: unsupported operand type(s) for +: 'Tag' and 'str'
import urllib
from bs4 import BeautifulSoup
# Python 2 script: urllib.urlopen and the print statement are not available in Python 3.
url ="https://en.wikipedia.org/wiki/Deadpool_(film)"
htmlfile = urllib.urlopen(url)
htmltext = htmlfile.read()
soup = BeautifulSoup(htmltext,"lxml")
#print soup.prettify()
# The page heading carries id="firstHeading".
movie_title = soup.find('h1',{'id':'firstHeading'})
print movie_title.text
# Every <p> in the document, not only the introduction.
movie_info = soup.find_all('p')
#print movie_info[0].text
#print movie_info[1].text
'''I dont want like this because we dont know how many
intro paragraphs will be so we have to scrape all paras just before that h2 tag'''
Here the problem rises i want to iterate and add .next_sibling and to make a try-exception block to find if the
"resultant_next_url.name == 'p' "
# Tries to form "<base_url>.next_sibling" by string concatenation and returns the result of that expression.
# The caller in this snippet passes movie_info[0], a parsed element, so base_url + '.' is Tag + str.
def findNextSibling(base_url):
tag_addition = 'next_sibling'
next_url = base_url+'.'+tag_addition
return next_url
And finally to do like this
# movie_info[0] is the first <p> element returned by find_all('p').
base_url = movie_info[0]
resultant_url = findNextSibling(base_url)
print resultant_url.text
Finally found answer, this is solving the problem
import urllib
from bs4 import BeautifulSoup
# Python 2 script: urllib.urlopen and the print statement are not available in Python 3.
url ="https://en.wikipedia.org/wiki/Deadpool_(film)"
htmlfile = urllib.urlopen(url)
htmltext = htmlfile.read()
soup = BeautifulSoup(htmltext,"lxml")
#print soup.prettify()
# The page heading carries id="firstHeading".
movie_title = soup.find('h1',{'id':'firstHeading'})
print movie_title.text
# Every <p> in the document, not only the introduction.
movie_info = soup.find_all('p')
# print movie_info[0].text
# print movie_info[1].text
def findNextSibling(resultant_url):
    """Return the node that immediately follows *resultant_url*.

    Args:
        resultant_url: Any object exposing a ``next_sibling`` attribute;
            the caller in this script passes a parsed ``<p>`` element.

    Returns:
        The value of ``resultant_url.next_sibling``, unchanged.
    """
    # Attribute access is done directly on the object; building the
    # attribute name by string concatenation cannot work on an element.
    return resultant_url.next_sibling
# Start from the first <p> element and step to whatever node follows it.
resultant_url = movie_info[0]
resultant_url = findNextSibling(resultant_url)
print resultant_url.text

Categories