Extract text from custom <h2> in <div> elements by BeautifulSoup

Extract text from custom <h2> in <div> elements by BeautifulSoup - python

Hi, I am trying to extract the name from an <h2>, but an error occurs and names are also extracted from other <h2> elements. I want to extract names only from the <h2> inside this structure: <div class="poap serp-container lawyer"><div class="gray_border"><div class="col-lg-8 col-md-8 col-sm-9 col-xs-8 text_container"><h2 class="indigo_text">Hi My name is Mark</h2></div></div></div>
import requests
import csv
from bs4 import BeautifulSoup
from itertools import zip_longest
# Result columns: exactly one entry is appended to each list per lawyer, so the
# rows written to the CSV stay aligned.
name = []
phone = []
logo = []
website = []
links = []
page_num = 1
# 126 and 20 are the values hard-coded in the original script.
page_limit = int("126") // 20


def _profile_details(link):
    """Fetch one profile page and return ``(phone, website, logo)``.

    A field is ``None`` when the corresponding element is not on the page,
    instead of aborting the whole run.
    """
    profile = BeautifulSoup(requests.get(link).content, "lxml")
    phone_link = profile.find("a", {"class": "profile-phone-header profile-contact-btn"})
    photo = profile.find("div", {"class": "photo-container"})
    image = photo.find("img") if photo else None
    site_link = profile.find("a", {"class": "profile-website-header", "id": "firm_website"})
    return (
        phone_link.get("href") if phone_link else None,
        site_link.text.strip() if site_link else None,
        image.get("src") if image else None,
    )


while True:
    # Check the limit before fetching so no request is spent on a page that
    # will not be processed.
    if page_num > page_limit:
        print("page ended, terminate")
        break
    try:
        result = requests.get(f"https://attorneys.superlawyers.com/motor-vehicle-accidents/texas/houston/page{page_num}/")
        soup = BeautifulSoup(result.content, "lxml")
        # NOTE: this selector matches every h2 with class "indigo_text" on the
        # page, not only those inside the "poap serp-container lawyer" div;
        # that is the behaviour the question asks about.
        names = soup.find_all("h2", {"class": "indigo_text"})
        for heading in names:
            anchor = heading.find("a")
            if anchor is None or not anchor.get("href"):
                # No profile link: skip the heading so the columns stay aligned.
                continue
            # Visit only the profile found just now; iterating the accumulated
            # `links` list would re-fetch every earlier profile on each page.
            profile_phone, profile_website, profile_logo = _profile_details(anchor["href"])
            name.append(heading.text.strip())
            links.append(anchor["href"])
            phone.append(profile_phone)
            website.append(profile_website)
            logo.append(profile_logo)
    except requests.RequestException as exc:
        # Only network failures end the run; the cause is reported instead of
        # being hidden behind a bare except.
        print("error", exc)
        break
    page_num += 1
    print("page switched")

file_list = [name, phone, website, logo]
exported = zip_longest(*file_list)
# newline="" is what the csv module expects; without it blank rows can appear
# between records on Windows.
with open("/Users/dsoky/Desktop/fonts/Moaaz.csv", "w", newline="", encoding="utf-8") as myfile:
    wr = csv.writer(myfile)
    wr.writerow(["name", "phone", "website", "logo"])
    wr.writerows(exported)
I hope you guys can help me solve this problem

Select your tag more specifically, for example with the following CSS selector:
names = soup.select('div.poap h2')
or with all the classes:
names = soup.select('div.poap.serp-container.lawyer h2.indigo_text')
Note: This answer focuses only on the main point of the question; the code could be improved further to avoid some side effects.

Related

Extracting URL From Span Element Without href

I am attempting to extract links from a website that does not use a href. I have tried multiple iterations of trying to find the tag associated with the url that from what I can gather is between <span> elements.
import requests
from bs4 import BeautifulSoup
url = 'https://www.flavortownusa.com/locations'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
lists = soup.find_all('div', class_='listing-item-inner')
# The context manager closes the file even if a lookup below raises; the
# original opened it and never closed it.
with open("test12.csv", "w") as f:
    # `item` instead of `list`, so the builtin is not shadowed.
    for item in lists:
        # The lookups below are the ones from the question and are kept as
        # posted; they are what the question is asking how to fix.
        title = item.find('span', class_='$0')
        webs = item.find('#text', class_='fa-fa.link')
        address = item.find('ul', class_='post-meta')
        temp = item.find('span', class_='text')
        temp2 = item.find('i', class_='(text)')
        info = [title, webs, address, temp, temp2]
        f.write(str(info))
        f.write("\n")
        print(info)
The desired output is to extract three pieces of data and place them into a CSV file: the text inside <span></span> (for example 345 40th Ave N), the URL that follows <i class='fa fa-link'>, and the text that follows <i class='fa fa-phone'>.

You could read the next element with e.find(class_ = 'fa-link').next after selecting the <i> with class fa-link:
for entry in lists:
    # Look the icon up once, then print the node that follows it, or an
    # empty string when the listing has no link icon.
    link_icon = entry.find(class_='fa-link')
    if link_icon:
        print(link_icon.next.strip())
    else:
        print('')
Note: Do not shadow built-in names such as list, and always check that the element you are searching for is actually present.
Example
import requests
from bs4 import BeautifulSoup
import csv

url = 'https://www.flavortownusa.com/locations'
# The example used `page` without ever requesting it (NameError).
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')


def _text_after(icon):
    """Return the stripped text node that directly follows *icon*.

    Returns '' when the icon is missing or is not followed by a text node.
    """
    if icon is None:
        return ''
    following = icon.next_element
    return following.strip() if isinstance(following, str) else ''


# csv.writer quotes fields, so a comma inside an address no longer splits the
# row the way ','.join did; newline='' is required by the csv module.
with open('somefile.csv', 'a', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    for e in soup.find_all('div', class_='listing-item-inner'):
        title = e.h3.get_text(strip=True) if e.h3 else ''
        webs = _text_after(e.select_one('.fa-link'))
        address = e.span.get_text(strip=True) if e.span else ''
        phone = _text_after(e.select_one('.fa-phone'))
        writer.writerow([title, webs, address, phone])

extract names in custom <h2> but It is extracted many times beautifulsoup

I am trying to extract names from a custom <h2>, but the names I want are extracted many times.
How can I fix this problem and extract each name only once?
The page I am pulling data from
here
import requests
import csv
from bs4 import BeautifulSoup
from itertools import zip_longest
# Result columns: exactly one entry is appended to each list per lawyer, so the
# rows written to the CSV stay aligned.
lawy_name = []
phone = []
logo = []
website = []
links = []
page_num = 1
# 126 and 25 are the values hard-coded in the original script.
page_limit = int("126") // 25


def _profile_details(link):
    """Fetch one profile page and return ``(phone, website, logo)``.

    A field is ``None`` when the corresponding element is not on the page,
    instead of aborting the whole run.
    """
    profile = BeautifulSoup(requests.get(link).content, "lxml")
    phone_link = profile.find("a", {"class": "profile-phone-header profile-contact-btn"})
    photo = profile.find("div", {"class": "photo-container"})
    image = photo.find("img") if photo else None
    site_link = profile.find("a", {"class": "profile-website-header", "id": "firm_website"})
    return (
        phone_link.get("href") if phone_link else None,
        site_link.text.strip() if site_link else None,
        image.get("src") if image else None,
    )


while True:
    # Check the limit before fetching so no request is spent on a page that
    # will not be processed.
    if page_num > page_limit:
        print("page ended, terminate")
        break
    try:
        result = requests.get(f"https://example.com/motor-vehicle-accidents/texas/houston/page{page_num}/")
        soup = BeautifulSoup(result.content, "lxml")
        lawy_names = soup.select('div.poap.serp-container.lawyer h2.indigo_text')
        # NOTE: names are appended as found; entries that the site itself
        # repeats are not filtered here, which is what the question is about.
        for heading in lawy_names:
            anchor = heading.find("a")
            if anchor is None or not anchor.get("href"):
                # No profile link: skip the heading so the columns stay aligned.
                continue
            # Visit only the profile found just now; iterating the accumulated
            # `links` list would re-fetch every earlier profile on each page.
            profile_phone, profile_website, profile_logo = _profile_details(anchor["href"])
            lawy_name.append(heading.text.strip())
            links.append(anchor["href"])
            phone.append(profile_phone)
            website.append(profile_website)
            logo.append(profile_logo)
    except requests.RequestException as exc:
        # Only network failures end the run; the cause is reported instead of
        # being hidden behind a bare except.
        print("error", exc)
        break
    page_num += 1
    print("page switched")

file_list = [lawy_name, phone, website, logo]
exported = zip_longest(*file_list)
# newline="" is what the csv module expects; without it blank rows can appear
# between records on Windows.
with open("/Users/dsoky/Desktop/fonts/Moaaz.csv", "w", newline="", encoding="utf-8") as myfile:
    wr = csv.writer(myfile)
    wr.writerow(["lawyer name", "phone", "website", "logo"])
    wr.writerows(exported)
Problem:

The website does produce a lot of duplicate entries. You could probably assume that all entries have unique names, as such a dictionary could be used to hold all of your data. Simply skip any entries for which you have already seen the same name. For example:
from bs4 import BeautifulSoup
import requests
import csv
# Maps lawyer name -> [phone, logo, website]. Using the name as the key is
# what removes the duplicate entries the site produces.
lawyers = {}
page_num = 1

while True:
    print(f"Page {page_num}")
    req = requests.get(f"https://example.com/motor-vehicle-accidents/texas/houston/page{page_num}/")
    soup = BeautifulSoup(req.content, "lxml")
    found = False

    # `section_id` rather than `id`, so the builtin is not shadowed.
    for section_id in ['sponsored_serps', 'ts_results', 'poap_results', 'basic_results']:
        div_results = soup.find('div', id=section_id)
        if not div_results:
            continue

        for result in div_results.find_all('div', class_='lawyer'):
            heading = result.h2
            if heading is None or heading.a is None or not heading.a.get('href'):
                # Entry without a name or profile link: nothing to record.
                continue

            name = heading.get_text(strip=True)
            if name in lawyers:
                continue

            print(' ', name)
            req_details = requests.get(heading.a['href'])
            soup_details = BeautifulSoup(req_details.content, "lxml")

            a_phone = soup_details.find("a", {"class": "profile-phone-header profile-contact-btn"}, href=True)
            phone = a_phone['href'] if a_phone else None

            # The container itself may be absent; the original dereferenced
            # it unconditionally and crashed with AttributeError.
            div_logo = soup_details.find("div", {"class": "photo-container"})
            logo = div_logo.img.get('src') if div_logo and div_logo.img else None

            a_website = soup_details.find("a", {"class": "profile-website-header", "id": "firm_website"})
            website = a_website.get_text(strip=True) if a_website else None

            lawyers[name] = [phone, logo, website]
            found = True

    # Keep going until a page yields no new names.
    if not found:
        break
    page_num += 1

with open('Moaaz.csv', 'w', newline='') as f_output:
    csv_output = csv.writer(f_output)
    csv_output.writerow(['Name', 'Phone', 'Logo', 'Website'])

    for name, details in lawyers.items():
        csv_output.writerow([name, *details])

How to extract text within h4 strong?

I am trying to extract each "Overall Rating" (number value in strong tags) from each product page
https://www.guitarguitar.co.uk/product/12082017334688--epiphone-les-paul-standard-plus-top-pro-translucent-blue
The structure goes as follows:
<div class="col-sm-12">
<h2 class="line-bottom"> Customer Reviews</h2>
<h4>
Overall Rating
<strong>5</strong>
<span></span>
</h4>
</div>
I am trying to extract only the strong values.
# The find() call was missing its closing parenthesis (syntax error); .h4 is
# taken from the first div with class "col-sm-12".
productsRating = soup.find("div", {"class": "col-sm-12"}).h4
This sometimes works, but the page makes use of same class for different elements so it extracts un-wanted html elements.
Is there any solution to only getting the products overall reviews?
EDITED!!
this is the whole loop for my program.
# Compiled once instead of twice per guitar: removes everything except
# digits, dots and commas from a price string.
trim = re.compile(r'[^\d.,]+')

for page in range(1, 2):
    guitarPage = requests.get('https://www.guitarguitar.co.uk/guitars/electric/page-{}'.format(page)).text
    soup = BeautifulSoup(guitarPage, 'lxml')
    guitars = soup.find_all(class_='col-xs-6 col-sm-4 col-md-4 col-lg-3')
    for guitar in guitars:
        title_text = guitar.h3.text.strip()
        print('Guitar Name: ', title_text)
        price = guitar.find(class_='price bold small').text.strip()
        int_price = trim.sub('', price)
        print('Guitar Price: ', int_price)
        priceSave = guitar.find('span', {'class': 'price save'})
        if priceSave is not None:
            priceOf = priceSave.text
            int_priceOff = trim.sub('', priceOf)
            print('Save: ', int_priceOff)
        else:
            print("No discount!")
        image = guitar.img.get('src')
        print('Guitar Image: ', image)
        productLink = guitar.find('a').get('href')
        linkProd = url + productLink
        print('Link of product', linkProd)
        productsPage.append(linkProd)

# Product pages are visited after all listing pages have been collected, so
# each product is fetched once. NOTE(review): the posted snippet lost its
# indentation; this placement is equivalent for the single page in range(1, 2).
for products in productsPage:
    response = requests.get(products)
    soup = BeautifulSoup(response.content, "lxml")
    productsDetails = soup.find("div", {"class": "description-preview"})
    if productsDetails is not None:
        description = productsDetails.text
        print('product detail: ', description)
    else:
        print('none')
    time.sleep(0.2)
    # As posted in the question: takes the first <strong> on the page, which
    # is not necessarily the overall rating.
    productsRating = soup.find_all('strong')[0].text
    print(productsRating)

The review info is all in a script tag, which you can extract and load with json. It should be simple enough to see how to fit that into a loop.
import requests
from bs4 import BeautifulSoup as bs
import json
url = 'https://www.guitarguitar.co.uk/product/12082017334688--epiphone-les-paul-standard-plus-top-pro-translucent-blue'
response = requests.get(url)
soup = bs(response.content, 'lxml')
# The review data is read from the script tag of type application/ld+json
# and parsed as JSON.
script = soup.select_one('[type="application/ld+json"]').text
data = json.loads(script.strip())
# NOTE(review): JSON-LD normally names this key '@graph'; confirm '#graph'
# against the payload the page actually serves.
product = data['#graph'][2]
overall_rating = product['aggregateRating']['ratingValue']
reviews = list(product['review'])  # extract what you want
Output:
Explore json
To handle products with no reviews you could use a simple try/except:
import requests
from bs4 import BeautifulSoup as bs
import json
url = 'https://www.guitarguitar.co.uk/product/190319340849008--gibson-les-paul-standard-60s-iced-tea'
r = requests.get(url)
soup = bs(r.content, 'lxml')
script = soup.select_one('[type="application/ld+json"]').text
data = json.loads(script.strip())
try:
    product = data['#graph'][2]
    overall_rating = product['aggregateRating']['ratingValue']
    reviews = list(product['review'])  # extract what you want
except (KeyError, IndexError, TypeError):
    # Only "this product has no rating/review data" is handled here: a
    # missing key, a graph list that is too short, or an unexpected value
    # type. Anything else propagates instead of being silently swallowed.
    overall_rating = "None"
    reviews = ['None']
or, use an if statement:
# `script` is the raw text of the JSON script tag and `data` its parsed form.
# The membership test runs against the raw text, not the parsed structure.
has_rating = 'aggregateRating' in script
if not has_rating:
    overall_rating = "None"
    reviews = ['None']
else:
    product = data['#graph'][2]
    overall_rating = product['aggregateRating']['ratingValue']
    reviews = list(product['review'])  # extract what you want

Try:
import requests
from bs4 import BeautifulSoup
url = 'https://www.guitarguitar.co.uk/product/190319340849008--gibson-les-paul-standard-60s-iced-tea'
html = requests.get(url).text
soup = BeautifulSoup(html, "lxml")


def _is_reviews_heading(text):
    """Return True when *text* is the "Customer Reviews" heading text.

    The comparison ignores case because the page markup shown in the question
    reads "Customer Reviews". *text* is None for headings that do not hold a
    single string, which made the original lambda raise TypeError.
    """
    return text is not None and "customer reviews" in text.lower()


heading = soup.find('h2', string=_is_reviews_heading)
# The rating is the <strong> inside the element that directly follows the
# heading; every step is checked so a page without reviews yields None.
rating_block = heading.find_next_sibling() if heading else None
strong = rating_block.find('strong') if rating_block else None
productsRating = strong.get_text(strip=True) if strong else None
print(productsRating)

Scrape site with multiple links without "next" button using beautiful soup

I am very new to python (three days in) and I have stumbled into a problem I can't solve with google/youtube. I want to scrape the National Governors Association for background data of all US governors and save this into a csv file.
I have managed to scrape a list of all governors, but to get more details I need to enter the page of each governor individually and save the data. I have found code suggestions online which utilises a "next" button or the url structure to loop over several sites. This website, however, does not have a next button and the url-links does not follow a loopable structure. So I am stuck.
I would appreciate any help I can get very much. I want to extract the info above the main text (Office Dates, School(s) etc in the "address" tag) in each governors page, for example in this one.
This is what I have got so far:
import bs4 as bs
import urllib.request
import pandas as pd
url = 'https://www.nga.org/cms/FormerGovBios?begincac77e09-db17-41cb-9de0-687b843338d0=10&endcac77e09-db17-41cb-9de0-687b843338d0=9999&pagesizecac77e09-db17-41cb-9de0-687b843338d0=10&militaryService=&higherOfficesServed=&religion=&lastName=&sex=Any&honors=&submit=Search&college=&firstName=&party=&inOffice=Any&biography=&warsServed=&'
sauce = urllib.request.urlopen(url).read()
soup = bs.BeautifulSoup(sauce, "html.parser")

# Download the list of all governors.
# NOTE(review): every table found is written to the same file name, so only
# the last one remains on disk; confirm the page contains a single table.
dfs = pd.read_html(url, header=0)
for frame in dfs:
    frame.to_csv('governors.csv')

# Download the link to each governor, one href per line.
table = soup.find('table', 'table table-striped table-striped')
links = table.find_all('a')
with open('governors_links.csv', 'w') as r:
    for anchor in links:
        r.write(anchor['href'] + '\n')
# The with-block closes the file, so no explicit close() is needed.

# Enter each governor page and extract the data in the "address" tag(s).
# Save this in a csv file.

I'm assuming that you've got all the links in a list named links.
You can do this to get the data you want of all the Governors one by one:
for link in links:
    page_html = urllib.request.urlopen(link).read()
    soup = bs.BeautifulSoup(page_html, 'html.parser')
    print(soup.find('h2').text)  # Name of Governor
    # col-md-3 holds office dates, address, phone, ...
    # col-md-7 holds family, school, birth state, ...
    for column_class in ('col-md-3', 'col-md-7'):
        column = soup.find('div', {'class': column_class})
        for paragraph in column.find_all('p'):
            print(paragraph.text.strip())
Edit:
Change your links list to
links = ['https://www.nga.org' + x.get('href') for x in table.findAll('a')]

This may work. I haven't tested it out to full completion since I'm at work but it should be a starting point for you.
import bs4 as bs
import requests
import re
def is_number(s):
    """Return True if ``int(s)`` succeeds, False otherwise.

    Values that ``int()`` rejects with TypeError (for example ``None``) or
    OverflowError (for example ``float('inf')``) are reported as not a number
    instead of raising.
    """
    try:
        int(s)
    except (TypeError, ValueError, OverflowError):
        return False
    return True
# Column order of Govs.csv. 'Address' appears twice because the original
# header and data line both repeated it; the layout is kept unchanged.
_GOV_COLUMNS = ['Name', 'Address', 'OfficeDates', 'Success', 'Address', 'Phone',
                'Fax', 'Born', 'BirthState', 'Party', 'Schooling', 'Email']


def _collect_info_lines(gov_soup):
    """Return the distinct, whitespace-normalised <p> texts inside <address> tags."""
    lines = []
    for address in gov_soup.find_all('address'):
        for info in address.find_all('p'):
            text = re.sub(r'\s+', ' ', info.text).strip()
            # De-duplicate while keeping page order (a set would shuffle it).
            if text and text not in lines:
                lines.append(text)
    return lines


def _strip_labels(line, *labels):
    """Return *line* with every label in *labels* removed and whitespace trimmed."""
    for label in labels:
        line = line.replace(label, '')
    return line.strip()


def _parse_governor(name, lines):
    """Build one record (dict keyed by column name) from a governor's info lines.

    Every column starts as '' so a field missing from the page cannot raise
    KeyError when the row is written.
    """
    gov = dict.fromkeys(_GOV_COLUMNS, '')
    gov['Name'] = name
    extra_address = []
    for line in lines:
        if 'Office Dates:' in line or 'OfficeDates:' in line:
            # Hyphens are removed as in the original code.
            gov['OfficeDates'] = _strip_labels(line, 'Office Dates:', 'OfficeDates:').replace('-', '')
        # `'Succ' or 'Fail' in line` was always true and swallowed every line.
        elif 'Succ' in line or 'Fail' in line:
            gov['Success'] = line
        elif 'Address' in line:
            gov['Address'] = _strip_labels(line, 'Address:')
        elif 'Phone:' in line or 'Phone ' in line:
            gov['Phone'] = _strip_labels(line, 'Phone:', 'Phone ')
        elif 'Fax:' in line:
            gov['Fax'] = _strip_labels(line, 'Fax:')
        elif 'Born:' in line:
            gov['Born'] = _strip_labels(line, 'Born:')
        elif 'Birth State:' in line or 'BirthState:' in line:
            gov['BirthState'] = _strip_labels(line, 'Birth State:', 'BirthState:')
        elif 'Party:' in line:
            gov['Party'] = _strip_labels(line, 'Party:')
        elif 'School(s)' in line:
            gov['Schooling'] = _strip_labels(line, 'School(s):', 'School(s) ')
        elif 'Email:' in line:
            gov['Email'] = _strip_labels(line, 'Email:')
        else:
            # Unlabelled lines are treated as continuation of the address.
            extra_address.append(line)
    gov['Address'] = ' '.join([gov['Address'], *extra_address]).strip()
    return gov


def _next_page_url(soup, base_url):
    """Return the absolute URL of the next result page, or None on the last page."""
    from urllib.parse import urljoin

    pagination = soup.find('ul', 'pagination center-blockdefault')
    if pagination is None:
        return None
    next_link = pagination.find('a', {'aria-label': 'Next'})
    if next_link is None or not next_link.get('href'):
        return None
    # BeautifulSoup returns the class attribute as a list, so comparing it to
    # the string 'disabled' never matched.
    parent_classes = next_link.parent.get('class') or []
    if 'disabled' in parent_classes:
        return None
    return urljoin(base_url, next_link.get('href'))


def main():
    """Scrape former-governor pages from nga.org and append them to Govs.csv.

    Follows the "Next" pagination link until it is missing or disabled. For
    each governor linked from the results table, the <address> paragraphs of
    the detail page are parsed into one CSV row. Network errors stop the run
    with a message; rows written so far are kept.
    """
    import csv
    import os
    from urllib.parse import urljoin

    base_url = 'https://nga.org'
    url = 'https://www.nga.org/cms/FormerGovBios?inOffice=Any&state=Any&party=&lastName=&firstName=&nbrterms=Any&biography=&sex=Any&religion=&race=Any&college=&higherOfficesServed=&militaryService=&warsServed=&honors=&birthState=Any&submit=Search'
    out_path = 'Govs.csv'
    # The file is opened in append mode; write the header only when the file
    # is new or empty so repeated runs do not insert extra header rows.
    needs_header = not os.path.exists(out_path) or os.path.getsize(out_path) == 0

    with open(out_path, 'a', newline='', encoding='utf-8') as csv_data:
        # csv.writer terminates every row and quotes fields containing commas.
        writer = csv.writer(csv_data)
        if needs_header:
            writer.writerow(_GOV_COLUMNS)
        try:
            while url:
                soup = bs.BeautifulSoup(requests.get(url).text, "html.parser")
                table = soup.find('table', 'table table-striped table-striped')
                if table is None:
                    break
                for link in table.find_all('a'):
                    href = link.get('href')
                    if not href:
                        continue
                    gov_sauce = requests.get(urljoin(base_url, href)).text
                    gov_soup = bs.BeautifulSoup(gov_sauce, "html.parser")
                    gov = _parse_governor(link.get_text(strip=True), _collect_info_lines(gov_soup))
                    writer.writerow([gov[column] for column in _GOV_COLUMNS])
                url = _next_page_url(soup, base_url)
        except requests.RequestException as exc:
            print('Code failed.', exc)


if __name__ == '__main__':
    main()

none returned when trying to get tag value

In this html snippet from https://letterboxd.com/shesnicky/list/top-50-favourite-films/, I'm trying to go through all the different li tags and get the info from 'data-target-link' so I can then use that to create a new link that takes me to the page for that film, however every time I try and get the data it simply returns None or an error along those lines.
<li class="poster-container numbered-list-item" data-owner-rating="10"> <div class="poster film-poster really-lazy-load" data-image-width="125" data-image-height="187" data-film-slug="/film/donnie-darko/" data-linked="linked" data-menu="menu" data-target-link="/film/donnie-darko/" > <img src="https://s3.ltrbxd.com/static/img/empty-poster-125.c6227b2a.png" class="image" width="125" height="187" alt="Donnie Darko"/><span class="frame"><span class="frame-title"></span></span> </div> <p class="list-number">1</p> </li>
I'm going to be using the links to grab imgs for a twitter bot, so I tried doing this within my code:
class BotStreamer(tweepy.StreamListener):
    """Stream listener that looks up a film poster for each incoming status.

    The lookups are kept exactly as posted in the question, including the
    ones that produce the errors the question describes.
    """

    # Runs once, when the class body is executed.
    print("Bot Streamer")

    # on_data method of Tweepy's StreamListener
    # passes data from statuses to the on_status method
    def on_status(self, status):
        """Pick a film from the list page and pass its image URL to tweet_reply."""
        print("on status")
        link = 'https://letterboxd.com/shesnicky/list/top-50-favourite-films/'
        list_page = requests.get(link)
        soup = BS(list_page.content, 'html.parser')
        movies_ul = soup.find('ul', {'class': 'poster-list -p125 -grid film-list'})

        # As posted: find() receives an attribute name here.
        movies = list(movies_ul.find('data-film-slug'))

        rand = randint(0, 51)
        newLink = "https://letterboxd.com%s" % (str(movies[rand]))
        film_page = requests.get(newLink)
        code = BS(film_page.content, 'html.parser')
        code_div = code.find(
            'div', {'class': 'react-component film-poster film-poster-51910 poster'})
        image = code_div.find('img')
        url = image.get('src')

        username = status.user.screen_name
        status_id = status.id
        tweet_reply(url, username, status_id)
However, I kept getting errors about the list index being out of range, or about not being able to iterate over NoneType. So I made a test program just to see if I could somehow get the data:
import requests
from bs4 import BeautifulSoup as BS
link = 'https://letterboxd.com/shesnicky/list/top-50-favourite-films/'
page = requests.get(link)
soup = BS(page.content, 'html.parser')
# Narrow down step by step: the film list, then its first list item.
movies_ul = soup.find('ul', {'class': 'poster-list -p125 -grid film-list'})
first_item = movies_ul.find('li', {'class': 'poster-container numbered-list-item'})
# As posted: 'data-target-link' is passed where find() takes a tag name.
found = first_item.find('data-target-link')
print(found)
And again, all I get is None. Any help greatly appreciated.

Read the documentation: find() expects a tag name as its first argument, not an attribute name.
You may do
soup.find('div', {'data-target-link': True})
or
soup.find(attrs={'data-target-link': True})
Full example
import requests
from bs4 import BeautifulSoup as BS
link = 'https://letterboxd.com/shesnicky/list/top-50-favourite-films/'
response = requests.get(link)
soup = BS(response.content, 'html.parser')
# Passing True as the attribute value selects <div> tags that carry a
# data-target-link attribute; the attribute's value is then printed.
for poster in soup.find_all('div', attrs={'data-target-link': True}):
    print(poster['data-target-link'])

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Extract text from custom <h2> in <div> elements by BeautifulSoup - python

Related

Extracting URL From Span Element Without href

extract names in custom <h2> but It is extracted many times beautifulsoup

How to extract text within h4 strong?

Scrape site with multiple links without "next" button using beautiful soup

none returned when trying to get tag value

Categories

Resources