I'm trying to web-scrape stock data row by row from https://www.slickcharts.com/sp500. I am following a tutorial that uses a similar website; however, that website uses classes for each of its rows, while mine doesn't (code attached below).
This is the code I'm trying to use, but I don't get any output whatsoever. I'm still pretty new at coding, so any feedback is welcome.
import requests
import pandas as pd
from bs4 import BeautifulSoup
company = []
symbol = []
url = 'https://www.slickcharts.com/sp500' #Data from SlickCharts
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
rows = soup.find_all('tr')
for i in rows:
    row = i.find_all('td')
    print(row[0])
First of all, you need to add some headers to your request, because most likely you are getting the same response I did: status code 403 Forbidden. The website is blocking your request. Adding a User-Agent does the trick:
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
}
page = requests.get(url, headers=headers)
Then you can iterate over the tr tags as you do. But you should be careful because, for example, the first tr doesn't contain any td tags, so you will get an exception on the line:
print(row[0])
Here is the example of code that prints names of all companies:
import requests
from bs4 import BeautifulSoup
company = []
symbol = []
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
url = 'https://www.slickcharts.com/sp500' #Data from SlickCharts
page = requests.get(url, headers=headers)
soup = BeautifulSoup(page.text, 'html.parser')
rows = soup.find_all('tr')
for row in rows:
    all_td_tags = row.find_all('td')
    if len(all_td_tags) > 0:
        print(all_td_tags[1].text)
But this code also outputs some other data besides company names, because you are iterating over all tr tags on the page. Instead, you need to iterate over one specific table only (the first table on the page in this case):
import requests
from bs4 import BeautifulSoup
company = []
symbol = []
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
url = 'https://www.slickcharts.com/sp500' #Data from SlickCharts
page = requests.get(url, headers=headers)
soup = BeautifulSoup(page.text, 'html.parser')
first_table_on_the_page = soup.find('table')
rows = first_table_on_the_page.find_all('tr')
for row in rows:
    all_td_tags = row.find_all('td')
    if len(all_td_tags) > 0:
        print(all_td_tags[1].text)
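Since your original code already imports pandas and defines the company and symbol lists, here is a rough sketch of how you could fill those lists from the same table and turn them into a DataFrame. The column indexes are an assumption based on the current table layout (rank, company, symbol, weight, ...), so double-check them against the page:

import requests
import pandas as pd
from bs4 import BeautifulSoup

company = []
symbol = []
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
url = 'https://www.slickcharts.com/sp500'

page = requests.get(url, headers=headers)
soup = BeautifulSoup(page.text, 'html.parser')

first_table_on_the_page = soup.find('table')
for row in first_table_on_the_page.find_all('tr'):
    all_td_tags = row.find_all('td')
    if len(all_td_tags) > 0:
        # Assumed column order: 0 = rank, 1 = company name, 2 = ticker symbol
        company.append(all_td_tags[1].text.strip())
        symbol.append(all_td_tags[2].text.strip())

df = pd.DataFrame({'Company': company, 'Symbol': symbol})
print(df.head())

If you have lxml installed, pandas.read_html(page.text) may also parse the table directly, but the loop above gives you more control over which columns you keep.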
I am trying to extract the first 100 URLs returned from a location search in Google; however, I am getting an empty list every time ("No search results found").
import requests
from bs4 import BeautifulSoup
def get_location_info(location):
    query = location + " information"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36'
    }
    url = "https://www.google.com/search?q=" + query
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    results = soup.find_all("div", class_="r")
    websites = []
    if results:
        counter = 0
        for result in results:
            websites.append(result.find("a")["href"])
            counter += 1
            if counter == 100:
                break
    else:
        print("No search results found.")
    return websites

location = "Athens"
print(get_location_info(location))
No search results found.
[]
I have also tried this approach:
import requests
from bs4 import BeautifulSoup
def get_location_info(location):
    query = location + " information"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36'
    }
    url = "https://www.google.com/search?q=" + query
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    results = soup.find_all("div", class_="r")
    websites = [result.find("a")["href"] for result in results][:10]
    return websites

location = "sifnos"
print(get_location_info(location))
and I get an empty list. I think I am doing everything suggested in similar posts, but I still get nothing.
First of all, always take a look at your soup to check whether all the expected ingredients are in place.
Select your elements more specifically, in this case for example with a CSS selector:
[a.get('href') for a in soup.select('a:has(>h3)')]
To avoid the consent banner, also send some cookies:
cookies={'CONSENT':'YES+'}
Example
import requests
from bs4 import BeautifulSoup
def get_location_info(location):
    query = location + " information"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36'
    }
    url = "https://www.google.com/search?q=" + query
    response = requests.get(url, headers=headers, cookies={'CONSENT':'YES+'})
    soup = BeautifulSoup(response.text, 'html.parser')
    websites = [a.get('href') for a in soup.select('a:has(>h3)')]
    return websites

location = "sifnos"
print(get_location_info(location))
Output
['https://www.griechenland.de/sifnos/', 'http://de.sifnos-greece.com/plan-trip-to-sifnos/travel-information.php', 'https://www.sifnosisland.gr/', 'https://www.visitgreece.gr/islands/cyclades/sifnos/', 'http://www.griechenland-insel.de/Hauptseiten/sifnos.htm', 'https://worldonabudget.de/sifnos-griechenland/', 'https://goodmorningworld.de/sifnos-griechenland/', 'https://de.wikipedia.org/wiki/Sifnos', 'https://sifnos.gr/en/sifnos/', 'https://www.discovergreece.com/de/cyclades/sifnos']
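If you specifically need up to the first 100 URLs, a rough sketch is to ask Google for more results per page via the num query parameter and cap the list. Note that num is not guaranteed to be honored and Google's markup can change at any time; the function name get_location_urls is just for illustration:

import requests
from bs4 import BeautifulSoup

def get_location_urls(location, max_results=100):
    query = location + " information"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36'
    }
    # 'num' asks for more results per page; treat it as best effort only
    url = "https://www.google.com/search?q=" + query + "&num=" + str(max_results)
    response = requests.get(url, headers=headers, cookies={'CONSENT': 'YES+'})
    soup = BeautifulSoup(response.text, 'html.parser')
    # Same selector as above, sliced to at most max_results entries
    return [a.get('href') for a in soup.select('a:has(>h3)')][:max_results]

print(len(get_location_urls("sifnos")))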
I am attempting to update my script so that it searches through not only the URL provided but all of the pages in the range (1-3) and adds them to the CSV. Can anyone spot why my current code is not working? Pages after the first are addressed in the following format: page-2
from bs4 import BeautifulSoup
import requests
from csv import writer
from random import randint
from time import sleep
#example of second page url: https://www.propertypal.com/property-for-sale/ballymena-area/page-2
url= "https://www.propertypal.com/property-for-sale/ballymena-area/"
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'}
for page in range(1, 4):
    req = requests.get(url + 'page-' + str(page), headers=headers)
    # print(page)

soup = BeautifulSoup(req.content, 'html.parser')
lists = soup.find_all('li', class_="pp-property-box")

with open('ballymena.csv', 'w', encoding='utf8', newline='') as f:
    thewriter = writer(f)
    header = ['Address', 'Price']
    thewriter.writerow(header)

    for list in lists:
        title = list.find('h2').text
        price = list.find('p', class_="pp-property-price").text
        info = [title, price]
        thewriter.writerow(info)

sleep(randint(2,10))
You are overwriting req multiple times and end up only analyzing the results of the last page. Put everything inside your loop.
edit: Also, the upper limit of range() is not included, so you want for page in range(1, 4): to get the first three pages.
edit: full example:
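A quick illustration of the exclusive upper bound:

print(list(range(1, 3)))  # [1, 2]    - only the first two pages
print(list(range(1, 4)))  # [1, 2, 3] - the stop value 4 itself is excluded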
from bs4 import BeautifulSoup
import requests
from csv import writer
url = "https://www.propertypal.com/property-for-sale/ballymena-area/page-"
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'}
with open('ballymena.csv', 'w', encoding='utf8', newline='') as f:
    thewriter = writer(f)
    header = ['Address', 'Price']
    thewriter.writerow(header)

    for page in range(1, 4):
        req = requests.get(f"{url}{page}", headers=headers)
        soup = BeautifulSoup(req.content, 'html.parser')

        for li in soup.find_all('li', class_="pp-property-box"):
            title = li.find('h2').text
            price = li.find('p', class_="pp-property-price").text
            info = [title, price]
            thewriter.writerow(info)
The solution from bitflip is fine; however, there are a few things I'll point out to help you.
Try to avoid variable names that are predefined functions in Python, list being one of those.
While the csv writer is a fine package to use, also consider using pandas. Further down the road you will likely need to do some data manipulation, so you might as well familiarise yourself with the package now. It's a very powerful tool.
Here's how I would have coded it.
from bs4 import BeautifulSoup
import requests
import pandas as pd
from random import randint
from time import sleep
from os.path import exists
#example of second page url: https://www.propertypal.com/property-for-sale/ballymena-area/page-2
url= "https://www.propertypal.com/property-for-sale/ballymena-area/"
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'}
# Check if csv file exists
file_exists = exists('ballymena.csv')
for page in range(1, 4):
    rows = []
    req = requests.get(url + 'page-' + str(page), headers=headers)
    # print(page)
    soup = BeautifulSoup(req.content, 'html.parser')
    lists = soup.find_all('li', class_="pp-property-box")

    for li in lists:
        title = li.find('h2').text
        price = li.find('p', class_="pp-property-price").text
        row = {
            'Address': title,
            'Price': price}
        rows.append(row)

    df = pd.DataFrame(rows)

    # If the file doesn't exist yet, write it with a header
    if not file_exists:
        df.to_csv('ballymena.csv', index=False)
        file_exists = True
    # If it already exists, append to it without a header
    else:
        df.to_csv('ballymena.csv', mode='a', header=False, index=False)

    sleep(randint(2,10))
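One design note: writing page by page as above means a partial CSV survives if the script stops mid-run. If that doesn't matter to you, a simpler variant (a sketch using the same assumed URL and selectors) is to collect all pages into one list and write the file once at the end:

from bs4 import BeautifulSoup
import requests
import pandas as pd
from random import randint
from time import sleep

url = "https://www.propertypal.com/property-for-sale/ballymena-area/"
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'}

all_rows = []
for page in range(1, 4):
    req = requests.get(url + 'page-' + str(page), headers=headers)
    soup = BeautifulSoup(req.content, 'html.parser')
    for li in soup.find_all('li', class_="pp-property-box"):
        all_rows.append({
            'Address': li.find('h2').text,
            'Price': li.find('p', class_="pp-property-price").text,
        })
    sleep(randint(2, 10))  # be polite between requests

pd.DataFrame(all_rows).to_csv('ballymena.csv', index=False)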
I want to extract the text from the div by its id, but every id has a different value, for example:
div, id='statement80863'
div, id='statement26092'
and so on.
CODE
import requests
from bs4 import BeautifulSoup
import re
limit = 100
url = f'https://www.counselingcalifornia.com/cc/cgi-bin/utilities.dll/customlist?FIRSTNAME=~&LASTNAME=~&ZIP=&DONORCLASSSTT=&_MULTIPLE_INSURANCE=&HASPHOTOFLG=&_MULTIPLE_EMPHASIS=ÐNIC=&_MULTIPLE_LANGUAGE=ENG&QNAME=THERAPISTLIST&WMT=NONE&WNR=NONE&WHP=therapistHeader.htm&WBP=therapistList.htm&RANGE=1%2F{limit}&SORT=LASTNAME'
headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Mobile Safari/537.36'}
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
rows = soup.find_all('div', {'class':'row'})
for row in rows:
    des = row.find('div', id='statement80863').text
    print(des)
You can use regular expressions to select only such <div> tags.
row.find('div', {'id': re.compile('^statement.*')}) will select all the <div> tags that have an id starting with the word statement.
import re
import requests
from bs4 import BeautifulSoup
url = 'https://www.counselingcalifornia.com/cc/cgi-bin/utilities.dll/customlist?FIRSTNAME=~&LASTNAME=~&ZIP=&DONORCLASSSTT=&_MULTIPLE_INSURANCE=&HASPHOTOFLG=&_MULTIPLE_EMPHASIS=ÐNIC=&_MULTIPLE_LANGUAGE=ENG&QNAME=THERAPISTLIST&WMT=NONE&WNR=NONE&WHP=therapistHeader.htm&WBP=therapistList.htm&RANGE=1%2F100&SORT=LASTNAME'
headers = {"User-agent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36"}
r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.text, 'lxml')
rows = soup.find_all('div', class_='row')
for row in rows:
    d = row.find('div', {'id': re.compile('^statement.*')})
    if d:
        # Your scraping code here, e.g.:
        print(d.get_text(strip=True))
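As a usage example, continuing from the soup object above, you could collect every statement text into one list instead of printing it (assuming the page structure stays the same):

statements = []
for row in soup.find_all('div', class_='row'):
    d = row.find('div', {'id': re.compile('^statement')})
    if d:
        statements.append(d.get_text(strip=True))

print(len(statements), 'statements found')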
Whilst making a web scraper I am able to find and scrape the data available.
On two fields of data I am able to use BeautifulSoup's get_text() to remove the HTML from the data,
but the third field will not work when I use get_text(). I can get it to give me the whole span tag, just not the text inside it.
I have tried different ways of getting the data; all the same, it gives me the whole span tag rather than the text inside it.
I am trying to set busnumber to the phone number inside this span tag:
<span class="business--telephoneNumber" itemprop="telephone">01430 422826 </span>
I've tried:
from bs4 import BeautifulSoup
import requests
import csv
data_list=[]
url = 'https://www.yell.com/ucs/UcsSearchAction.do?keywords=farmer&location=leeds'
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
}

site = requests.get(url, headers=headers, timeout=5)

if site.status_code is 200:
    content = BeautifulSoup(site.content, 'html.parser')
    #print(content)
    questions = content.find_all(class_='businessCapsule')
    for question in questions:
        busname = question.find(class_='businessCapsule--name').get_text()
        bustype = question.find(class_='businessCapsule--classification').get_text()
        busnum = question.find('span', {'itemprop': 'telephone'})
        print(busnum)
        busnumber = busnum.get_text()
        new_data = {"busname": busname, "bustype": bustype, "busnumber": busnumber}
        data_list.append(new_data)

    with open('selector.csv', 'w') as file:
        writer = csv.DictWriter(file, fieldnames=["busname", "bustype", "busnumber"], delimiter=';')
        writer.writeheader()
        for row in data_list:
            writer.writerow(row)
as well as
from bs4 import BeautifulSoup
import requests
import csv
data_list=[]
url = 'https://www.yell.com/ucs/UcsSearchAction.do?keywords=farmer&location=leeds'
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
}

site = requests.get(url, headers=headers, timeout=5)

if site.status_code is 200:
    content = BeautifulSoup(site.content, 'html.parser')
    #print(content)
    questions = content.find_all(class_='businessCapsule')
    for question in questions:
        busname = question.find(class_='businessCapsule--name').get_text()
        bustype = question.find(class_='businessCapsule--classification').get_text()
        busnumber = question.find('span', {'itemprop': 'telephone'}).get_text()
        new_data = {"busname": busname, "bustype": bustype, "busnumber": busnumber}
        data_list.append(new_data)

    with open('selector.csv', 'w') as file:
        writer = csv.DictWriter(file, fieldnames=["busname", "bustype", "busnumber"], delimiter=';')
        writer.writeheader()
        for row in data_list:
            writer.writerow(row)
In both cases get_text() gives this error:
Traceback (most recent call last):
  File "webscraper2.py", line 22, in <module>
    busnumber = busnum.get_text()
AttributeError: 'NoneType' object has no attribute 'get_text'
If get_text() is removed, it gives the whole tag:
<span class="business--telephoneNumber" itemprop="telephone">01430 422826 </span>
I only need the phone number inside it.
Update - latest code:
from bs4 import BeautifulSoup as bs
import requests
import csv
data_list=[]
url = 'https://www.yell.com/ucs/UcsSearchAction.do?keywords=farmer&location=leeds'
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
}

site = requests.get(url, headers=headers, timeout=5)
soup = bs(site.content, 'html.parser')

questions = soup.select('.businessCapsule--mainContent')
for question in questions:
    busname = question.find(class_='businessCapsule--name').get_text()
    bustype = question.find(class_='businessCapsule--classification').get_text()
    busnumber = question.select_one('span.business--telephoneNumber').text
    print(busnumber)
    new_data = {"busname": busname, "bustype": bustype, "busnumber": busnumber}
    data_list.append(new_data)

with open('selector.csv', 'w') as file:
    writer = csv.DictWriter(file, fieldnames=["busname", "bustype", "busnumber"], delimiter=';')
    writer.writeheader()
    for row in data_list:
        writer.writerow(row)
You need to get a different parent in order to select the appropriate child and change your selector for the child as shown below:
import requests
from bs4 import BeautifulSoup as bs
url = 'https://www.yell.com/ucs/UcsSearchAction.do?keywords=farmer&location=leeds'
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
}
site = requests.get(url, headers=headers, timeout=5)
soup = bs(site.content, 'lxml')
questions = soup.select('.businessCapsule--mainContent:has(span.business--telephoneNumber)')

for question in questions:
    print(question.select_one('span.business--telephoneNumber').text)
If you check this different parent selector you will see that it selects the entire box with the info in it, so you can then select your various children.
If that is too restrictive, you can instead test whether the telephone element is present:
import requests
from bs4 import BeautifulSoup as bs
url = 'https://www.yell.com/ucs/UcsSearchAction.do?keywords=farmer&location=leeds'
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
}
site = requests.get(url, headers=headers, timeout=5)
soup = bs(site.content, 'lxml')
questions = soup.select('.businessCapsule--mainContent')

for question in questions:
    tel = question.select_one('span.business--telephoneNumber')
    if tel is None:
        tel = 'Not present'
    else:
        tel = tel.text
    print(tel)
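To tie this back to your CSV, here is a rough sketch that combines the fallback with your other two fields. It assumes the businessCapsule--name and businessCapsule--classification classes from your original code are still present under this parent, so verify them in your soup first:

import csv
import requests
from bs4 import BeautifulSoup as bs

url = 'https://www.yell.com/ucs/UcsSearchAction.do?keywords=farmer&location=leeds'
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
}

site = requests.get(url, headers=headers, timeout=5)
soup = bs(site.content, 'html.parser')

data_list = []
for question in soup.select('.businessCapsule--mainContent'):
    name = question.find(class_='businessCapsule--name')
    kind = question.find(class_='businessCapsule--classification')
    tel = question.select_one('span.business--telephoneNumber')
    # Fall back to an empty string / placeholder when an element is missing
    data_list.append({
        'busname': name.get_text(strip=True) if name else '',
        'bustype': kind.get_text(strip=True) if kind else '',
        'busnumber': tel.get_text(strip=True) if tel else 'Not present',
    })

with open('selector.csv', 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=['busname', 'bustype', 'busnumber'], delimiter=';')
    writer.writeheader()
    writer.writerows(data_list)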
I want to do web scraping on a Twitter page to download tweets for a specific search word. I am not able to fetch all the tweets recursively; rather, I can only fetch 20 tweets. Please help me fetch all the tweets recursively. Below is the code:
from bs4 import BeautifulSoup
import requests
import pandas as pd
company_name = 'ABC'
url = 'https://twitter.com/search?q=%23%27%20%20%20' + company_name + '&src=typd&lang=en'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
req = requests.get(url, headers=headers);#print(req)
data = req.text;# print(data)
# soup = BeautifulSoup(data, "lxml");# print(soup)
soup = BeautifulSoup(data, "html.parser");# print(soup)
tweets = [p.text for p in soup.findAll('p', class_='tweet-text')]
# print(tweets)
df = pd.DataFrame()
df['Tweet'] = tweets
print(df.head())
print(df.shape)