How do I extract only the content from this webpage - python

I am trying out web scraping using BeautifulSoup.
I only want to extract the content from this webpage, basically everything about Barry Kripke, without all the headers etc.
https://bigbangtheory.fandom.com/wiki/Barry_Kripke
I tried this, but it doesn't give me what I want:
import urllib3
from bs4 import BeautifulSoup

quote = 'https://bigbangtheory.fandom.com/wiki/Barry_Kripke'
http = urllib3.PoolManager()
r = http.request('GET', quote)
if r.status == 200:
    page = r.data
    print('Type of the variable \'page\':', page.__class__.__name__)
    print('Page Retrieved. Request Status: %d, Page Size: %d' % (r.status, len(page)))
else:
    print('Some problem occurred. Request Status: %s' % r.status)

soup = BeautifulSoup(page, 'html.parser')
print('Type of the variable \'soup\':', soup.__class__.__name__)
print(soup.prettify()[:1000])

article_tag = 'p'
article = soup.find_all(article_tag)[0]
print(f'Type of the variable "article": {article.__class__.__name__}')
print(article.text)
The output I get is just the first paragraph.
What I want is the whole article content.
Next I tried to get all the links, but that didn't work either; I got only 2 links:
for t in article.find_all('a'):
    print(t)
Can someone please help me with this?

You only grab and print the first <p> tag with article = soup.find_all(article_tag)[0].
You need to go through all the <p> tags:
import requests
from bs4 import BeautifulSoup

url = 'https://bigbangtheory.fandom.com/wiki/Barry_Kripke'
r = requests.get(url)
if r.status_code == 200:
    page = r.text
    print('Type of the variable \'page\':', page.__class__.__name__)
    print('Page Retrieved. Request Status: %d, Page Size: %d' % (r.status_code, len(page)))
else:
    print('Some problem occurred. Request Status: %s' % r.status_code)

soup = BeautifulSoup(page, 'html.parser')
print('Type of the variable \'soup\':', soup.__class__.__name__)
print(soup.prettify()[:1000])

article_tag = 'p'
articles = soup.find_all(article_tag)
for p in articles:
    print(p.text)
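If you want just the article body without the page chrome, you can also scope the search to the main content container before collecting <p> tags. A minimal sketch, assuming the usual MediaWiki wrapper class mw-parser-output (Fandom wikis run on MediaWiki, but verify the class name in your browser's inspector):

import requests
from bs4 import BeautifulSoup

url = 'https://bigbangtheory.fandom.com/wiki/Barry_Kripke'
soup = BeautifulSoup(requests.get(url).text, 'html.parser')

# Assumption: the article body lives in the standard MediaWiki wrapper div.
content = soup.find('div', class_='mw-parser-output')
if content is not None:
    for p in content.find_all('p'):
        print(p.get_text(strip=True))
    # This also answers the links question: iterate over the whole article
    # body rather than a single <p> tag.
    for a in content.find_all('a', href=True):
        print(a['href'])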

Related

Python requests not scraping the right content from Google News

I'm trying to scrape all news headlines from Google News (note: not via news.google.com) with the following conditions:
i. keyword(s),
ii. specific date range,
iii. sorted by date, and
iv. able to loop through the pages
This is the link of a regular google search with specified keywords:
https://www.google.com/search?q=migrant%2Bcaravans&rlz=1C1GCEA_enUS827US827&sxsrf=ACYBGNT3ExxxPO5PSo9Cgp91M37sVBHLMA:1576086735805&source=lnms&tbm=nws&sa=X&ved=2ahUKEwji9pbQlK7mAhWIxFkKHWDQCCcQ_AUoAXoECBAQAw&biw=1680&bih=939
And this is the link of my google with the same keywords with sorted by date and date range:
https://www.google.com/search?q=migrant%2Bcaravans&rlz=1C1GCEA_enUS827US827&tbs=cdr:1,cd_min:1/1/2017,cd_max:12/31/2017,sbd:1&tbm=nws&sxsrf=ACYBGNRZjtVzEEfuEKcHjuOYUmubi5pT3g:1576086970386&source=lnt&sa=X&ved=0ahUKEwjc1oTAla7mAhWExVkKHQlVB_YQpwUIIA&biw=1680&bih=939&dpr=1
This is a sample of my code that is able to scrape the headlines from a regular search without any of the conditions imposed:
import sys
import requests
from bs4 import BeautifulSoup

def scrape_news_summaries(topic, pagenum=1):
    #time.sleep(randint(0, 2))
    url = "http://www.google.com/search?q=" + topic + "&tbm=nws&dpr=" + str(pagenum)
    r = requests.get(url)
    if r.status_code != 200:
        print('status code for ' + url + ' was ' + str(r.status_code))
        sys.exit(-1)
    soup = BeautifulSoup(r.text, "html.parser")
    return soup

scrape_news_summaries("migrant+caravans")
This is the code with the URL altered to include a date range and to sort the results by date:
def scrape_news_date_range(query, min_date, max_date, pagenum=1):
    url = "https://www.google.com/search?q=" + query + "&rlz=1C1GCEA_enUS827US827&tbs=cdr:1,cd_min:" + min_date + ",cd_max:" + max_date + ",sbd:1&tbm=nws/*,ned=es_sv*/&dpr=" + str(pagenum)
    r = requests.get(url)
    if r.status_code != 200:
        print('status code for ' + url + ' was ' + str(r.status_code))
        sys.exit(-1)
    soup = BeautifulSoup(r.text, "html.parser")
    #return soup
    print(soup)

scrape_news_date_range("migrant+caravans", "1/1/2017", "12/1/2017")
It doesn't return the same content as the second link I shared above; instead it returns the content of a regular, unfiltered search.
I greatly appreciate any help with this! Thank you so much!
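One thing worth checking, as a sketch rather than a confirmed fix: requests sends its default python-requests User-Agent, and Google commonly serves a simplified results page (ignoring parameters such as tbs) to non-browser agents. Passing the query through params also keeps the tbs value encoded consistently. The User-Agent string and the start pagination parameter below are assumptions, not taken from the question:

import requests
from bs4 import BeautifulSoup

def scrape_news_date_range(query, min_date, max_date, pagenum=0):
    params = {
        'q': query,
        'tbm': 'nws',  # news vertical
        'tbs': 'cdr:1,cd_min:%s,cd_max:%s,sbd:1' % (min_date, max_date),
        'start': pagenum * 10,  # assumption: results are paged 10 at a time
    }
    # Assumption: a browser-like User-Agent; Google may ignore tbs without one.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
    r = requests.get('https://www.google.com/search', params=params, headers=headers)
    r.raise_for_status()
    return BeautifulSoup(r.text, 'html.parser')

soup = scrape_news_date_range('migrant caravans', '1/1/2017', '12/31/2017')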

Web scraping nested comments on Reddit using BeautifulSoup

This code gets the page. My problem is that I need to scrape the content of users' comments, not the number of comments. The comments are nested inside the comment-count section, but I am not sure how I can access the link, parse through it, and scrape the user comments.
import time
import requests
from bs4 import BeautifulSoup

request_list = []
id_list = [0]
for i in range(0, 200, 25):
    response = requests.get("https://www.reddit.com/r/CryptoCurrency/?count=" + str(i) + "&after=" + str(id_list[-1]), headers={'User-agent': 'No Bot'})
    soup = BeautifulSoup(response.content, 'lxml')
    request_list.append(soup)
    id_list.append(soup.find_all('div', attrs={'data-type': 'link'})[-1]['data-fullname'])
    print(i, id_list)
    if i % 100 == 0:
        time.sleep(1)
In the code below I tried to write a function that is supposed to access the nested comments, but I have no clue how.
def extract_comment_contents(request_list):
    comment_contents_list = []
    for i in request_list:
        if response.status_code == 200:
            for each in i.find_all('a', attrs={'data-inbound-url': '/r/CryptoCurrency/comments/'}):
                comment_contents_list.append(each.text)
        else:
            print("Call failed at request ", i)
    return comment_contents_list

fetch_comment_contents_list = extract_comment_contents(request_list)
print(fetch_comment_contents_list)
For each thread, you need to send another request to get the comments page. The URL of the comments page can be found with soup.find_all('a', class_='bylink comments may-blank'), which gives all the a tags that hold the URL of a comments page. Here is one example of getting to the comments page:
import requests
from bs4 import BeautifulSoup

r = requests.get('https://www.reddit.com/r/CryptoCurrency/?count=0&after=0')
soup = BeautifulSoup(r.text, 'lxml')

for comments_tag in soup.find_all('a', class_='bylink comments may-blank', href=True):
    url = comments_tag['href']
    r2 = requests.get(url)
    soup = BeautifulSoup(r2.text, 'lxml')
    # Your job is to parse this soup object and get all the comments.
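To go one step further, here is a rough sketch of pulling the comment text itself. It assumes the old-Reddit markup, where each comment body sits in a div.usertext-body wrapping a div.md; that selector is an assumption, so inspect the HTML you actually receive if it comes back empty:

import time
import requests
from bs4 import BeautifulSoup

headers = {'User-agent': 'No Bot'}
r = requests.get('https://old.reddit.com/r/CryptoCurrency/', headers=headers)
soup = BeautifulSoup(r.text, 'lxml')

for comments_tag in soup.find_all('a', class_='bylink comments may-blank', href=True):
    r2 = requests.get(comments_tag['href'], headers=headers)
    comments_soup = BeautifulSoup(r2.text, 'lxml')
    # Assumption: old-Reddit layout; each comment body is div.usertext-body > div.md.
    for body in comments_soup.select('div.usertext-body div.md'):
        print(body.get_text(strip=True))
    time.sleep(1)  # be polite; Reddit rate-limits anonymous scrapers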

None returned when trying to get a tag value

In this HTML snippet from https://letterboxd.com/shesnicky/list/top-50-favourite-films/, I'm trying to go through all the li tags and get the value of data-target-link, so I can use it to build a link that takes me to the page for that film. However, every time I try to get the data it simply returns None, or an error along those lines.
<li class="poster-container numbered-list-item" data-owner-rating="10">
  <div class="poster film-poster really-lazy-load" data-image-width="125"
       data-image-height="187" data-film-slug="/film/donnie-darko/"
       data-linked="linked" data-menu="menu" data-target-link="/film/donnie-darko/">
    <img src="https://s3.ltrbxd.com/static/img/empty-poster-125.c6227b2a.png"
         class="image" width="125" height="187" alt="Donnie Darko"/>
    <span class="frame"><span class="frame-title"></span></span>
  </div>
  <p class="list-number">1</p>
</li>
I'm going to be using the links to grab images for a Twitter bot, so I tried doing this within my code:
class BotStreamer(tweepy.StreamListener):
    print "Bot Streamer"

    # on_data method of Tweepy's StreamListener
    # passes data from statuses to the on_status method
    def on_status(self, status):
        print "on status"
        link = 'https://letterboxd.com/shesnicky/list/top-50-favourite-films/'
        page = requests.get(link)
        soup = BS(page.content, 'html.parser')
        movies_ul = soup.find('ul', {'class': 'poster-list -p125 -grid film-list'})

        movies = []
        for mov in movies_ul.find('data-film-slug'):
            movies.append(mov)

        rand = randint(0, 51)
        newLink = "https://letterboxd.com%s" % (str(movies[rand]))
        newPage = requests.get(newLink)
        code = BS(newPage.content, 'html.parser')
        code_div = code.find('div', {'class': 'react-component film-poster film-poster-51910 poster'})
        image = code_div.find('img')
        url = image.get('src')

        username = status.user.screen_name
        status_id = status.id
        tweet_reply(url, username, status_id)
However, I kept getting errors about the list index being out of range, or about not being able to iterate over NoneType. So I made a test program just to see if I could somehow get the data:
import requests
from bs4 import BeautifulSoup as BS
link = 'https://letterboxd.com/shesnicky/list/top-50-favourite-films/'
page = requests.get(link)
soup = BS(page.content, 'html.parser')
movies_ul = soup.find('ul', {'class':'poster-list -p125 -grid film-list'})
more = movies_ul.find('li', {'class':'poster-container numbered-list-item'})
k = more.find('data-target-link')
print k
And again, all I get is None. Any help greatly appreciated.
Read the docs: find() expects a tag name as its first argument, not an attribute.
You may do
soup.find('div', {'data-target-link': True})
or
soup.find(attrs={'data-target-link': True})
Full example:

import requests
from bs4 import BeautifulSoup as BS

link = 'https://letterboxd.com/shesnicky/list/top-50-favourite-films/'
page = requests.get(link)
soup = BS(page.content, 'html.parser')

all_items = soup.find_all('div', {'data-target-link': True})
for item in all_items:
    print(item['data-target-link'])
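From there, building the full film URL is plain string concatenation, mirroring the newLink pattern in the question (random.choice stands in for the question's randint indexing):

import random

slugs = [item['data-target-link'] for item in all_items]
new_link = 'https://letterboxd.com' + random.choice(slugs)
print(new_link)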

Pagination with BeautifulSoup

I am trying to get some data from the following website: https://www.drugbank.ca/drugs
For every drug in the table, I need to go one level deeper and get the name and some other specific features, like categories and structured indication (please click on a drug name to see the features I will use).
I wrote the following code, but the issue is that I can't make it handle pagination (as you can see, there are more than 2000 pages!).
import requests
from bs4 import BeautifulSoup

def drug_data():
    url = 'https://www.drugbank.ca/drugs/'
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "lxml")
    for link in soup.select('name-head a'):
        href = 'https://www.drugbank.ca/drugs/' + link.get('href')
        pages_data(href)

def pages_data(item_url):
    r = requests.get(item_url)
    soup = BeautifulSoup(r.text, "lxml")
    g_data = soup.select('div.content-container')
    for item in g_data:
        print item.contents[1].text
        print item.contents[3].findAll('td')[1].text
        try:
            print item.contents[5].findAll('td', {'class': 'col-md-2 col-sm-4'})[0].text
        except:
            pass
        print item_url

drug_data()
How can I scrape all of the data and handle pagination properly?
This page uses almost the same URL for every page, so you can use a for loop to generate them:
def drug_data(page_number):
    url = 'https://www.drugbank.ca/drugs/?page=' + str(page_number)
    # ... rest ...

# --- later ---

for x in range(1, 2001):
    drug_data(x)
Or use while and try/except to handle more than 2000 pages:
def drug_data(page_number):
    url = 'https://www.drugbank.ca/drugs/?page=' + str(page_number)
    # ... rest ...

# --- later ---

page = 0
while True:
    try:
        page += 1
        drug_data(page)
    except Exception as ex:
        print(ex)
        print("probably last page:", page)
        break  # exit `while` loop
You can also find the URL of the next page in the HTML:

<a rel="next" class="page-link" href="/drugs?approved=1&c=name&d=up&page=2">›</a>

so you can use BeautifulSoup to get this link and follow it.
The code below displays the current URL, finds the link to the next page (using class="page-link" and rel="next"), and loads it:
import requests
from bs4 import BeautifulSoup

def drug_data():
    url = 'https://www.drugbank.ca/drugs/'

    while url:
        print(url)
        r = requests.get(url)
        soup = BeautifulSoup(r.text, "lxml")

        #data = soup.select('name-head a')
        #for link in data:
        #    href = 'https://www.drugbank.ca/drugs/' + link.get('href')
        #    pages_data(href)

        # next page url
        url = soup.findAll('a', {'class': 'page-link', 'rel': 'next'})
        print(url)

        if url:
            url = 'https://www.drugbank.ca' + url[0].get('href')
        else:
            break

drug_data()
BTW: never use a bare except: pass, because you can hit an error you didn't expect and you won't know why the code doesn't work. It's better to display the error:
except Exception as ex:
    print('Error:', ex)

Error handling with BeautifulSoup when a scraped URL doesn't respond

I'm a total noob to Python, so please forgive my mistakes and lack of vocabulary. I'm trying to scrape some URLs with BeautifulSoup. The URLs come from a Google Analytics API call, and some of them don't respond.
How do I build my script so that it ignores the URLs that don't return anything?
Here is my code:
if results:
    for row in results.get('rows'):
        rawdata.append(row[0])
else:
    print 'No results found'

urllist = [mystring + x for x in rawdata]

for row in urllist[4:8]:
    page = urllib2.urlopen(row)
    soup = BeautifulSoup(page, 'html.parser')
    name_box = soup.find(attrs={'class': 'nb-shares'})
    share = name_box.text.strip()

    # save the data in tuple
    sharelist.append((row, share))

print(sharelist)
I tried to use this:

except Exception:
    pass

but I don't know where to put it, and I got a syntax error. I've looked at other questions, but cannot find an answer that works for me.
You can check the value of the name_box variable; it will be None if nothing is found:
for row in urllist[4:8]:
    page = urllib2.urlopen(row)
    soup = BeautifulSoup(page, 'html.parser')

    name_box = soup.find(attrs={'class': 'nb-shares'})
    if name_box is None:
        continue
    # ...
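That handles pages that load but lack the nb-shares element. For URLs that don't respond at all, wrap the urlopen call itself in a try/except. A minimal sketch in the question's Python 2 style, reusing its urllist and sharelist variables:

import urllib2
from bs4 import BeautifulSoup

for row in urllist[4:8]:
    try:
        page = urllib2.urlopen(row)
    except urllib2.URLError as ex:
        # URLError also covers HTTPError; the URL didn't respond, so skip it.
        print 'Skipping %s: %s' % (row, ex)
        continue

    soup = BeautifulSoup(page, 'html.parser')
    name_box = soup.find(attrs={'class': 'nb-shares'})
    if name_box is None:
        continue

    sharelist.append((row, name_box.text.strip()))

print(sharelist)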
