Simple Web Scraper returning no data - python

I'm trying to scrape data from a webpage, but it's returning ["F"] ["F"], which is what it should do if no data has been retrieved. Please see the code below:
import pandas as pd
import datetime
import requests
from requests.exceptions import ConnectionError
from bs4 import BeautifulSoup

def web_content_div(web_content, class_path):
    web_content_div = web_content.find_all('div', {"class": class_path})
    try:
        spans = web_content_div[0].find_all('span')
        texts = [span.get_text() for span in spans]
    except IndexError:
        texts = []
    return texts

def real_time_price(stock_code):
    url = 'https://finance.yahoo.com/quote/' + stock_code + '?p=' + stock_code + '%27&.tsrc=fin-srch'
    # 'https://finance.yahoo.com/quote/' + stock_code + '?p=' + stock_code + '&.tsrc=fin-srch'
    try:
        r = requests.get(url)
        web_content = BeautifulSoup(r.text, 'lxml')
        texts = web_content_div(web_content, 'My(6px) Pos(r) smarthphone_Mt(6px) W(100&%')
        if texts != []:
            price, change = texts[0], texts[1]
        else:
            price, change = ["F"], ["F"]
    except ConnectionError:
        price, change = [""], [""]
    return price, change

Stock = ["BRK-B"]
print(real_time_price("BRK-B"))

Your class_path doesn't exist due to a couple of typos. The website in question uses "My(6px) Pos(r) smartphone_Mt(6px) W(100%)", which I believe is what you're targeting.
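Assuming that is the class string you meant, the call would become the line below (a sketch; note that Yahoo Finance generates these utility class names dynamically, so they can change again and a more stable selector may eventually be needed):

texts = web_content_div(web_content, 'My(6px) Pos(r) smartphone_Mt(6px) W(100%)')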

How to extract content from a pagination next button?

This is the website I am trying to scrape:
https://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage
Below is the code I have tried, but it repeatedly returns only the first and third pages.
from bs4 import BeautifulSoup
from urllib.request import urlopen

def parse():
    base_url = 'https://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage'
    url = "https://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=3"
    while True:
        html = urlopen(url)
        soup = BeautifulSoup(html, "html.parser")
        for link in soup.find_all('div', class_='entry-content'):
            try:
                shops = soup.find_all('div', class_="col-9")
                names = soup.find_all('tr', class_="clickable")
                for n, k in zip(names, shops):
                    name = n.find_all('td')[1].text.replace(' ', '')
                    desc = k.text.replace(' ', '')
                    print(name + "\n")
                    print(desc)
            except AttributeError as e:
                print(e)
        next_button = soup.find('a', href=True)
        if next_button:
            url = base_url + next_button['href']
        else:
            break

parse()
Select your elements more specifically; a CSS selector is used here to get the <a> that is a child of an element with class="PagedList-skipToNext":
next_button = soup.select_one('.PagedList-skipToNext a')
Also check the result of your selection; base_url is not needed here:
url = next_button.get('href')
Example
from bs4 import BeautifulSoup
import requests

def parse():
    url = 'https://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage'
    while True:
        soup = BeautifulSoup(requests.get(url).text, 'html.parser')
        print(url)  # to see what you are working on, or add the code that should run per page
        next_button = soup.select_one('.PagedList-skipToNext a')
        if next_button:
            url = next_button.get('href')
        else:
            break

parse()
Output
https://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=2
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=3
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=4
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=5
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=6
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=7
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=8
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=9
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=10
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=11
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=12
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=13
http://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage&page=14
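If you want to keep the question's per-page scraping while following the next button, a minimal sketch (assuming the question's selectors, tr.clickable for the store rows and div.col-9 for the description blocks, still match the page) could look like this:

from bs4 import BeautifulSoup
import requests

def parse():
    url = 'https://www.jurongpoint.com.sg/store-directory/?level=&cate=Food+%26+Beverage'
    while True:
        soup = BeautifulSoup(requests.get(url).text, 'html.parser')
        # pair each clickable row (name) with its description block, as in the question's code
        for row, shop in zip(soup.find_all('tr', class_='clickable'),
                             soup.find_all('div', class_='col-9')):
            name = row.find_all('td')[1].get_text(strip=True)
            desc = shop.get_text(strip=True)
            print(name, '-', desc)
        # the "next" link carries an absolute URL to the following page
        next_button = soup.select_one('.PagedList-skipToNext a')
        if not next_button:
            break
        url = next_button.get('href')

parse()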

How to scrape texts from votesmart via BeautifulSoup

I am trying to scrape some statements made by U.S. politicians on votesmart.org.
The script runs, but it throws errors when extracting the text of the statements.
The code that I am using is as follows:
from bs4 import BeautifulSoup
from time import sleep
import pandas as pd
import requests
import os

def main():
    df = pd.read_csv('https://theunitedstates.io/congress-legislators/legislators-current.csv')
    df = df[df.type == 'sen']
    df = df[~df.votesmart_id.isna()]
    done_list = os.listdir('corpus')
    print("{} senators".format(len(df)))
    df = df[~df.full_name.isin(done_list)]
    print("{} after some already done".format(len(df)))
    df = df.sample(frac=1)
    df.apply(scrape_politician_speeches, axis=1)

def scrape_politician_speeches(row):
    print('Scraping {}...'.format(row.full_name))
    vs_url = 'https://justfacts.votesmart.org/candidate/public-statements/{}'.format(int(row.votesmart_id))
    vs_page = requests.get(vs_url)  # fill in the last part of the url
    soup = BeautifulSoup(vs_page.content, features="lxml")
    n_pages = 1
    page_num = 1
    while page_num <= n_pages:
        print("\tPage {} of {}".format(page_num, n_pages))
        #speeches_url = vs_page.url + '?start=2019-01-01&speechType=14&p={}'.format(page_num)
        speeches_url = vs_page.url + '/?s=date&start=2020/01/01&end=&p={}'.format(page_num)
        speeches_page = requests.get(speeches_url)
        soup = BeautifulSoup(speeches_page.content, features="lxml")
        speech_table = soup.find('table', {'id': 'statementsObjectsTables'})
        speech_table = soup.find('tbody')
        speech_links = speech_table.find_all('a', href=True)
        speech_hrefs = [a.get('href') for a in speech_links]
        for href in speech_hrefs:
            scrape_speech(person=row.full_name, speech_url=href)
        try:
            n_pages = int(soup.find('h7').text.split()[-1])
        except:
            print("\tNo page numbers")
            pass
        page_num += 1
        sleep(1)

def scrape_speech(person, speech_url):
    try:
        if not os.path.isdir('corpus/{}'.format(person)):
            os.mkdir('corpus/{}'.format(person))
        speech_page = requests.get(speech_url)
        soup = BeautifulSoup(speech_page.content, features="lxml")
        title = soup.find('h3').text
        date = soup.find('span', {'itemprop': 'datePublished'}).text
        location = soup.find('span', {'itemprop': 'contentLocation'}).text
        body = soup.find('div', {'class': "main clear"})
        p_list = body.find_all('p')
        text_list = [p.text for p in p_list]
        speech_text = '\n\n'.join(text_list)
        full_text = '{}\n\n\n{}'.format(title, speech_text)
        file_name = '{}, {}, {}.txt'.format(title.split(',')[0], date, location)
        file_name = file_name.replace('/', ' ')
        with open('corpus/{}/{}'.format(person, file_name), 'w') as f:
            f.write(full_text)
    except:
        print("\tError with {}".format(speech_url))

if __name__ == '__main__':
    main()
The errors look like this:
95 senators
95 after some already done
Scraping Tammy Duckworth...
Page 1 of 1
Error with https://votesmart.org/public-statement/1570841/durbin-duckworth-announce-135-million-for-springfield-rail-improvement-project
Error with https://votesmart.org/public-statement/1570825/durbin-duckworth-statement-on-nomination-of-ladon-reynolds-to-serve-as-us-marshal-for-the-northern-district-of-illinois
Error with https://votesmart.org/public-statement/1570826/durbin-duckworth-announce-16-million-in-telehealth-funding-for-illinois-health-care-providers
Thank you so much for your time and attention. I hope to learn more from this wonderful community.
scrape_speech is outdated; the pages' design has probably changed since the script was written. There is no <div class="main clear"> in the HTML, no <span itemprop="datePublished">, and so on. You need to rewrite it using selectors that match the current markup.
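As a starting point for that rewrite, here is a minimal sketch of a more defensive scrape_speech: it looks elements up with select_one and reports exactly which selectors no longer match instead of swallowing the exception in a bare except. The selectors used ('h1', 'time', '.statement-body') are placeholders, not the site's actual markup; inspect the current page and substitute whatever it really uses.

import requests
from bs4 import BeautifulSoup

def scrape_speech(person, speech_url):
    soup = BeautifulSoup(requests.get(speech_url).content, features="lxml")
    # Placeholder selectors: replace them with the elements that actually hold
    # the title, date and body text in the current page design.
    fields = {
        'title': soup.select_one('h1'),
        'date': soup.select_one('time'),
        'body': soup.select_one('.statement-body'),
    }
    missing = [name for name, tag in fields.items() if tag is None]
    if missing:
        # report which selectors failed instead of hiding the error
        print("\t{}: selectors not found: {}".format(speech_url, ', '.join(missing)))
        return None
    title = fields['title'].get_text(strip=True)
    date = fields['date'].get_text(strip=True)
    body_text = '\n\n'.join(p.get_text() for p in fields['body'].find_all('p'))
    return person, title, date, body_text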

Why is Beautiful Soup Returning duplicate results?

I am creating a project that scrapes Indeed's website. It was working fine, but when I ran it today, without having made any changes, instead of returning the entire page of results it now only displays the first result repeated over and over. Can someone help me correct this?
from tkinter import *
import random
import urllib.request
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import pandas as pd
import requests

html_text = requests.get('https://www.ign.com/').text
soup = BeautifulSoup(html_text, 'lxml')
jobs = soup.find('section', class_='right')
#print(html_text)

driver = webdriver.Chrome(executable_path='/Users/Miscellaneous/PycharmProjects/RecursivePractice/chromedriver')
url = "https://www.indeed.com/jobs?q=developer&l=Westbury%2C%20NY&vjk=0b0cbe29e5f86422"
driver.maximize_window()
driver.get(url)
time.sleep(5)

content = driver.page_source.encode('utf-8').strip()
soup = BeautifulSoup(content, "html.parser")
officials = soup.findAll("a", {"class": "tapItem"})

for official in officials:
    jobTitle = soup.find('h2', {'class': 'jobTitle'}).text
    companyName = soup.find('div', {'class': 'comapny_location'})
    location = soup.find('div', {'class': 'companyLocation'}).text
    salary = soup.find('div', {'class': 'salary-snippet'})
    actualSalary = salary.find('span').text
    summary = soup.find('div', {'class': 'job-snippet'}).text
    print('Title: ' + str(jobTitle) + '\nCompany Name: ' + str(companyName) + '\nLocation: ' + str(location)
          + '\nSalary: ' + str(actualSalary) + "\nSummary: " + str(summary))
    #print(str(official))
    print(' ')

driver.quit()
Try this
from tkinter import *
import random
import urllib.request
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import pandas as pd
import requests

html_text = requests.get('https://www.ign.com/').text
soup = BeautifulSoup(html_text, 'lxml')
jobs = soup.find('section', class_='right')

driver = webdriver.Chrome(executable_path='/Users/Miscellaneous/PycharmProjects/RecursivePractice/chromedriver')
url = "https://www.indeed.com/jobs?q=developer&l=Westbury%2C%20NY&vjk=0b0cbe29e5f86422"
driver.maximize_window()
driver.get(url)
time.sleep(5)

content = driver.page_source.encode('utf-8').strip()
soup = BeautifulSoup(content, "html.parser")
officials = soup.findAll("a", {"class": "tapItem"})

for i in range(len(officials)):
    jobTitle = soup.findAll('h2', {'class': 'jobTitle'})[i].text
    companyName = soup.findAll('div', {'class': 'comapny_location'})[i].text if len(soup.findAll('div', {'class': 'comapny_location'})) > i else "NULL"
    location = soup.findAll('div', {'class': 'companyLocation'})[i].text if len(soup.findAll('div', {'class': 'companyLocation'})) > i else "NULL"
    salary = soup.findAll('div', {'class': 'salary-snippet'})[i].text if len(soup.findAll('div', {'class': 'salary-snippet'})) > i else "NULL"
    actualSalary = salary  # salary is already the text here, so no further .find() is needed
    summary = soup.findAll('div', {'class': 'job-snippet'})[i].text if len(soup.findAll('div', {'class': 'job-snippet'})) > i else "NULL"
    print('Title: ' + str(jobTitle) + '\nCompany Name: ' + str(companyName) + '\nLocation: ' + str(location)
          + '\nSalary: ' + str(actualSalary) + "\nSummary: " + str(summary))
    print(' ')

driver.quit()
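For what it's worth, the duplicates in the original code come from calling soup.find(...) inside the loop: that always searches the whole document and returns the first match, no matter which result card the loop variable is on. An alternative sketch (assuming each a.tapItem card still contains the h2.jobTitle, div.companyLocation, div.salary-snippet and div.job-snippet elements) is to search within each card instead, replacing the loop body above:

for official in officials:
    # search inside the current result card, not the whole page
    job_title = official.find('h2', {'class': 'jobTitle'})
    location = official.find('div', {'class': 'companyLocation'})
    salary = official.find('div', {'class': 'salary-snippet'})
    summary = official.find('div', {'class': 'job-snippet'})
    print('Title:', job_title.text if job_title else 'NULL')
    print('Location:', location.text if location else 'NULL')
    print('Salary:', salary.text if salary else 'NULL')
    print('Summary:', summary.text if summary else 'NULL')
    print(' ')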

Beautiful soup doesn't scrape the data from the "next" pages

I am trying to scrape Airbnb data using BeautifulSoup and Pandas. I checked a lot of tutorials and followed one of them. The step in which the soup should scrape the data from the next page is not working: out of 15 pages, it scrapes only the first 2 or 3, or sometimes none at all (even though the page URLs are correct).
I cannot understand why this happens or how to solve it. Can someone help out?
import requests
import bs4
import pandas as pd
import numpy as np
import csv
import time

url = 'https://www.airbnb.it/s/Italy/homes?checkin=2021-08-01&checkout=2021-08-02'

def get_page(url):
    response = requests.get(url)
    soup = bs4.BeautifulSoup(response.text, "html.parser")
    return soup

def get_listings(soup):
    result = []
    result.extend(soup.find_all("div", {"class": "_8ssblpx"}))
    return result

def get_listing_title(listing):
    for l in listing:
        try:
            return str(l.find('div', {'class': '_1tanv1h'}).text)
        except:
            return None

def get_listing_subtitle(listing):
    for l in listing:
        try:
            return str(l.find('span', {'class': '_1whrsux9'}).text)
        except:
            return None

def get_listing_info(listing):
    for l in listing:
        try:
            return str(l.find_all('div', {'class': '_3c0zz1'})[0].text.lower())
        except:
            return None

def find_next_page(page):
    base_url = "https://www.airbnb.it"
    try:
        nextpage = base_url + get_page(url).find_all("div", attrs={"class": "_jro6t0"})[0].find("a", attrs={'class': '_za9j7e'})['href']
    except:
        nextpage = None
    return nextpage

title = []
subtitle = []
info = []

while url is not None:
    soup = get_page(url)
    listings = get_listings(soup)
    for l in listings:
        title.append(get_listing_title(l))
        subtitle.append(get_listing_subtitle(l))
        info.append(get_listing_info(l))
    time.sleep(5)
    url = find_next_page(soup)
    print(url)

airbnb_data = pd.DataFrame(data={'title': title,
                                 'subtitle': subtitle,
                                 'info': info})
airbnb_data
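One thing worth checking in the posted code: find_next_page(page) never uses the soup it is given. It calls get_page(url) on the module-level url again, so the "next" link is looked up on a freshly requested page rather than the one just scraped, and the bare except turns any failure (a blocked request, a renamed class) into nextpage = None, which silently ends the loop. A minimal sketch of the same function that uses its argument and makes the stopping reason visible, keeping the question's class names:

def find_next_page(page):
    base_url = "https://www.airbnb.it"
    # look for the "next" link on the page that was just scraped, not on a re-fetched one
    container = page.find("div", attrs={"class": "_jro6t0"})
    next_link = container.find("a", attrs={"class": "_za9j7e"}) if container else None
    if next_link is None or not next_link.get("href"):
        print("No next link found on this page")
        return None
    return base_url + next_link["href"]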

image_url = soup.select('.image a')[0]['href'] IndexError: list index out of range

I used part of this code to retrieve images from imgur when the URL doesn't end in an image extension like .png/.jpg. However, I am getting the errors below. Please have a look and suggest fixes:
https://github.com/asweigart/imgur-hosted-reddit-posted-downloader/blob/master/imgur-hosted-reddit-posted-downloader.py
import datetime
import praw
import re
import urllib
import requests
from bs4 import BeautifulSoup

sub = 'dog'
imgurUrlPattern = re.compile(r'(http://i.imgur.com/(.*))(\?.*)?')
r = praw.Reddit(user_agent="download all images from a subreddit",
                user_site="lamiastella")
already_done = []
#checkWords = ['i.imgur.com', 'jpg', 'png',]
check_words = ['jpg', 'png']

subreddit = r.get_subreddit(sub)
for submission in subreddit.get_hot(limit=10000):
    is_image = any(string in submission.url for string in check_words)
    print '[LOG] Getting url: ' + submission.url
    if submission.id not in already_done and is_image:
        if submission.url.endswith('/'):
            modified_url = submission.url[:len(submission.url)-1]
            try:
                urllib.urlretrieve(modified_url, '/home/jalal/computer_vision/image_retrieval/images/' + datetime.datetime.now().strftime('%y-%m-%d-%s') + modified_url[-4:])
            except IOError:
                pass
        else:
            try:
                urllib.urlretrieve(submission.url, '/home/jalal/computer_vision/image_retrieval/images/' + datetime.datetime.now().strftime('%y-%m-%d-%s') + submission.url[-4:])
            except IOError:
                pass
        already_done.append(submission.id)
        print '[LOG] Done Getting ' + submission.url
        print('{0}: {1}'.format('submission id is', submission.id))
    elif 'http://imgur.com/' in submission.url:
        # This is an Imgur page with a single image.
        html_source = requests.get(submission.url).text  # download the image's page
        soup = BeautifulSoup(html_source, "lxml")
        image_url = soup.select('.image a')[0]['href']
        if image_url.startswith('//'):
            # if no schema is supplied in the url, prepend 'http:' to it
            image_url = 'http:' + image_url
        image_id = image_url[image_url.rfind('/') + 1:image_url.rfind('.')]
        urllib.urlretrieve(image_url, '/home/jalal/computer_vision/image_retrieval/images/' + datetime.datetime.now().strftime('%y-%m-%d-%s') + image_url[-4:])
The error is:
[LOG] Getting url: http://imgur.com/a/yOLjm
Traceback (most recent call last):
File "download_images.py", line 43, in <module>
image_url = soup.select('.image a')[0]['href']
IndexError: list index out of range
Your selection is wrong; you need soup.select('img')[0]['src']:
import datetime
import urllib
import requests
from bs4 import BeautifulSoup

url = 'http://imgur.com/gyUWtWP'
html_source = requests.get(url).text
soup = BeautifulSoup(html_source, "lxml")
image_url = soup.select('img')[0]['src']
if image_url.startswith('//'):
    image_url = 'http:' + image_url
image_id = image_url[image_url.rfind('/') + 1:image_url.rfind('.')]
urllib.urlretrieve(image_url, datetime.datetime.now().strftime('%y-%m-%d-%s') + image_url[-4:])
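Continuing from the example above: the IndexError in the traceback occurs whenever the selector matches nothing, and the failing URL http://imgur.com/a/yOLjm is an album page, which is laid out differently from a single-image page. A small guard (a sketch in the same Python 2 style as the snippet above) avoids crashing in that case:

images = soup.select('img')
if not images:
    # nothing matched, e.g. an album page with a different layout; skip it
    print('[LOG] No <img> found at ' + url)
else:
    image_url = images[0]['src']
    if image_url.startswith('//'):
        image_url = 'http:' + image_url
    urllib.urlretrieve(image_url, datetime.datetime.now().strftime('%y-%m-%d-%s') + image_url[-4:])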
