Extracting URL From Span Element Without href

Extracting URL From Span Element Without href - python

I am attempting to extract links from a website that does not use a href. I have tried multiple iterations of trying to find the tag associated with the url that from what I can gather is between <span> elements.
import requests
from bs4 import BeautifulSoup
url = 'https://www.flavortownusa.com/locations'
page = requests.get(url)
f = open("test12.csv", "w")
soup = BeautifulSoup(page.content, 'html.parser')
lists = soup.find_all('div', class_ = 'listing-item-inner')
for list in lists:
title = list.find('span', class_ = '$0')
webs = list.find('#text', class_ = 'fa-fa.link')
address = list.find('ul', class_ = 'post-meta')
temp = list.find('span', class_ = 'text')
temp2 = list.find('i', class_ = '(text)')
info = [title, webs, address, temp, temp2]
f.write(str(info))
f.write("\n")
print(info)
The desired output is to extract data from <span></span> where the 345 40th Ave N and the url below i class = 'fa fa-link' and i class = 'fa fa-phone' where the three elements are placed into a CSV File

You could call next element e.find(class_ = 'fa-link').nextafter selecting the <i> with class fa-link:
for e in lists:
print(e.find(class_ = 'fa-link').next.strip() if e.find(class_ = 'fa-link') else '')
Note: Do not use reserved keywords like list and always check if element you are searching for is available.
Example
import requests
from bs4 import BeautifulSoup
url = 'https://www.flavortownusa.com/locations'
soup = BeautifulSoup(page.content, 'html.parser')
with open('somefile.csv', 'a', encoding='utf-8') as f:
for e in soup.find_all('div', class_ = 'listing-item-inner'):
title = e.h3.text
webs = e.select_one('.fa-link').next if e.select_one('.fa-link') else ''
address = e.span.text
phone = e.select_one('.fa-phone').next if e.select_one('.fa-phone') else ''
f.write(','.join([title, webs, address, phone])+'\n')

Related

Webscraping with BS4 NoneType object has no attribute find

I'm not sure why my code isn't working. I get AttributeError: 'NoneType' object has no attribute 'find'
My code is as follows:
import requests
from bs4 import BeautifulSoup
import csv
root_url = "https://urj.org/urj-congregations?congregation=&distance_address_field=&distance_num_miles=5.0&worship_services=All&community=All&urj_camp_affiliations=All&page=0"
html = requests.get(root_url)
soup = BeautifulSoup(html.text, 'html.parser')
paging = soup.find("nav",{"aria-label":"pagination-heading-3"}).find("li",{"class":"page-item"}).find_all("a")
start_page = paging[1].text
last_page = paging[len(paging)-2].text
outfile = open('congregationlookup.csv','w', newline='')
writer = csv.writer(outfile)
writer.writerow(["Name", "Address", "Phone"])
pages = list(range(1,int(last_page)+1))
for page in pages:
url = 'https://urj.org/urj-congregations?congregation=&distance_address_field=&distance_num_miles=5.0&worship_services=All&community=All&urj_camp_affiliations=All&page=%s' %(page)
html = requests.get(url)
soup = BeautifulSoup(html.text, 'html.parser')
#print(soup.prettify())
print ('Processing page: %s' %(page))
name_list = soup.findAll("div",{"class":"views-field views-field-congregation"})
for element in name_list:
name = element.find('h3').text
address = element.find('field-content mb-2').text.strip()
phone = element.find("i",{"class":"fa fa-phone mr-1"}).text.strip()
writer.writerow([name, address, phone])
outfile.close()
print ('Done')
I'm trying to scrape the name, address, and phone number from the URJ Congregations website.
Thank you

Final code
import csv
import requests
from bs4 import BeautifulSoup
# root_url = "https://urj.org/urj-congregations?congregation=&distance_address_field=&distance_num_miles=5.0&worship_services=All&community=All&urj_camp_affiliations=All&page=0"
# html = requests.get(root_url)
# soup = BeautifulSoup(html.text, 'html.parser')
# paging = soup.find("nav", {"aria-label": "pagination-heading--3"}).find("ul", {"class": "pagination"}).find_all("a")
# start_page = paging[1].text
# last_page = paging[len(paging) - 3].text
outfile = open('congregationlookup.csv', 'w', newline='')
writer = csv.writer(outfile)
writer.writerow(["Name", "Address", "Phone"])
pages = list(range(1, 1000))
for page in pages:
url = 'https://urj.org/urj-congregations?congregation=&distance_address_field=&distance_num_miles=5.0&worship_services=All&community=All&urj_camp_affiliations=All&page=%s' % (
page)
html = requests.get(url)
soup = BeautifulSoup(html.text, 'html.parser')
# print(soup.prettify())
print('Processing page: %s' % (page))
elements = soup.find_all("div", {"class": "views-row"})
if len(elements) == 0:
break
for element in elements:
name = element.find("div", {"class": "views-field views-field-congregation"}).text.strip()
address = element.find("div", {"class": "views-field views-field-country"}).text.strip()
phone = element.find("div", {"class": "views-field views-field-website"}).text.strip().split("\n")[0]
writer.writerow([name, address, phone])
outfile.close()
print('Done')

Most likely, your name_list contains a None type. So, when you attempt to run element.find(), you are performing a string operation on a None, hence your error.
https://docs.python.org/3/library/stdtypes.html#str.find
Also as an FYI, findAll() is bs3 syntax. You should use find_all() Difference between "findAll" and "find_all" in BeautifulSoup

There is a load of problems
The first problem is
"pagination-heading--3"
istead of
"pagination-heading-3"
Next i changed
paging = soup.find("nav",{"aria-label":"pagination-heading-3"}).find("li",{"class":"page-item"}).find_all("a")
To
paging = soup.find("nav", {"aria-label": "pagination-heading--3"}).find("ul", {"class": "pagination"}).find_all("a")
This was the line where i swapped first problematic string. And also i changed the second search to find ul. You were trying to find 1 li and searching inside of it. This would have reproduced empty list
Next
last_page = paging[len(paging) - 3].text
as you are trying to get 3rd element from the end
It still doesn't work, i will keep updating

Extract text from custom <h2> in <div> elements by BeautifulSoup

Hi i try to extract the name from h2 but an error occurs and names are extracted from other <h2> I want to extract names from <h2> specified from only <div class="poap serp-container lawyer"><div class="gray_border"><div class="col-lg-8 col-md-8 col-sm-9 col-xs-8 text_container"><h2 class=""indigo_text>Hi My name is Mark</h2></div></div></div>
import requests
import csv
from bs4 import BeautifulSoup
from itertools import zip_longest
name = []
page_num = 1
phone = []
logo = []
website = []
links = []
while True:
try:
result = requests.get(f"https://attorneys.superlawyers.com/motor-vehicle-accidents/texas/houston/page{page_num}/")
src = result.content
soup = BeautifulSoup(src, "lxml")
page_limit = int("126")
if(page_num > page_limit // 20):
print("page ended, terminate")
break
names = soup.find_all("h2", {"class":"indigo_text"})
for i in range(len(names)) :
name.append(names[i].text.strip())
links.append(names[i].find("a").attrs["href"])
for link in links:
result = requests.get(link)
src = result.content
soup = BeautifulSoup(src, "lxml")
phones = soup.find("a", {"class":"profile-phone-header profile-contact-btn"})
phone.append(phones["href"])
logos = soup.find("div", {"class":"photo-container"})
logo.append(logos.find('img')['src'])
websites = soup.find("a", {"class":"profile-website-header","id":"firm_website"})
website.append(websites.text.strip())
page_num +=1
print("page switched")
except:
print("error")
break
file_list = [name, phone, website, logo]
exported = zip_longest(*file_list)
with open("/Users/dsoky/Desktop/fonts/Moaaz.csv", "w") as myfile:
wr = csv.writer(myfile)
wr.writerow(["name","phone","website","logo"])
wr.writerows(exported)
I hope you guys can help me solve this problem

Select your tag more specific for example with following css selector:
names = soup.select('div.poap h2')
or with all the classes:
names = soup.select('div.poap.serp-container.lawyer h2.indigo_text')
Note This answer just focus to main point in question, code could be imporved to avoid some side effects.

How to list out all the h2, h3, and p tags then create a dataframe to store them

I had given a website to scrape all of the key items
But the output I got is only for one item using BeautifulSoup4. So wonder if I need to use anything like soup.findall to extract all the key items in a list from the website.
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
url=
html = urlopen(url)
soup = BeautifulSoup(html, 'html.parser')
column= soup.find(class_ = re.compile('columns is-multiline'))
print(column.prettify())
position = column.h2.text
company = column.h3.text
city_state= column.find_all('p')[-2].text
print (position, company, city_state)
Thank you.

Try this:
from urllib.request import urlopen
from bs4 import BeautifulSoup
url = 'https://realpython.github.io/fake-jobs/'
html = urlopen(url)
soup = BeautifulSoup(html, 'html.parser')
positions = [pos.text for pos in soup.find_all('h2')]
companies = [com.text for com in soup.find_all('h3')]
city_state0 = []
city_state1 = []
for p in soup.find_all('p', {'class' : 'location'}):
city_state0.append(p.text.split(',')[0].strip())
city_state1.append(p.text.split(',')[1].strip())
df = pd.DataFrame({
'city_state1': city_state0,
'city_state2': city_state1,
'companies' : companies,
'positions' : positions
})
print(df)
Output:

You need to use find_all to get all the elements like so. find only gets the first element.
titles = soup.find_all('h2', class_='title is-5')
companies = soup.find_all('h3', class_='subtitle is-6 company')
locations = soup.find_all('p', class_='location')
# loop over locations and extract the city and state
for location in locations:
city = location.split(', ')[0]
state = location.split(', ')[1]

How to extract text within h4 strong?

I am trying to extract each "Overall Rating" (number value in strong tags) from each product page
https://www.guitarguitar.co.uk/product/12082017334688--epiphone-les-paul-standard-plus-top-pro-translucent-blue
The structure goes as follows:
<div class="col-sm-12">
<h2 class="line-bottom"> Customer Reviews</h2>
<h4>
Overall Rating
<strong>5</strong>
<span></span>
</h4>
</div>
I am trying to extract only the strong values.
productsRating = soup.find("div", {"class": "col-sm-12"}.h4
This sometimes works, but the page makes use of same class for different elements so it extracts un-wanted html elements.
Is there any solution to only getting the products overall reviews?
EDITED!!
this is the whole loop for my program.
for page in range(1, 2):
guitarPage = requests.get('https://www.guitarguitar.co.uk/guitars/electric/page-{}'.format(page)).text
soup = BeautifulSoup(guitarPage, 'lxml')
guitars = soup.find_all(class_='col-xs-6 col-sm-4 col-md-4 col-lg-3')
for guitar in guitars:
title_text = guitar.h3.text.strip()
print('Guitar Name: ', title_text)
price = guitar.find(class_='price bold small').text.strip()
trim = re.compile(r'[^\d.,]+')
int_price = trim.sub('', price)
print('Guitar Price: ', int_price)
priceSave = guitar.find('span', {'class': 'price save'})
if priceSave is not None:
priceOf = priceSave.text
trim = re.compile(r'[^\d.,]+')
int_priceOff = trim.sub('', priceOf)
print('Save: ', int_priceOff)
else:
print("No discount!")
image = guitar.img.get('src')
print('Guitar Image: ', image)
productLink = guitar.find('a').get('href')
linkProd = url + productLink
print('Link of product', linkProd)
productsPage.append(linkProd)
for products in productsPage:
response = requests.get(products)
soup = BeautifulSoup(response.content, "lxml")
productsDetails = soup.find("div", {"class": "description-preview"})
if productsDetails is not None:
description = productsDetails.text
print('product detail: ', description)
else:
print('none')
time.sleep(0.2)
productsRating = soup.find_all('strong')[0].text
print(productsRating)

Review info is all in a script tag you can extract and load with json. Simply enough to see how to fit that in a loop.
import requests
from bs4 import BeautifulSoup as bs
import json
url = 'https://www.guitarguitar.co.uk/product/12082017334688--epiphone-les-paul-standard-plus-top-pro-translucent-blue'
r = requests.get(url)
soup = bs(r.content, 'lxml')
script = soup.select_one('[type="application/ld+json"]').text
data = json.loads(script.strip())
overall_rating = data['#graph'][2]['aggregateRating']['ratingValue']
reviews = [review for review in data['#graph'][2]['review']] #extract what you want
Output:
Explore json
To handle no reviews you could use a simply try except:
import requests
from bs4 import BeautifulSoup as bs
import json
url = 'https://www.guitarguitar.co.uk/product/190319340849008--gibson-les-paul-standard-60s-iced-tea'
r = requests.get(url)
soup = bs(r.content, 'lxml')
script = soup.select_one('[type="application/ld+json"]').text
data = json.loads(script.strip())
try:
overall_rating = data['#graph'][2]['aggregateRating']['ratingValue']
reviews = [review for review in data['#graph'][2]['review']] #extract what you want
except: #you might want to use except KeyError
overall_rating = "None"
reviews = ['None']
or, use an if statement:
if 'aggregateRating' in script:
overall_rating = data['#graph'][2]['aggregateRating']['ratingValue']
reviews = [review for review in data['#graph'][2]['review']] #extract what you want
else:
overall_rating = "None"
reviews = ['None']

Try:
import requests
from bs4 import BeautifulSoup
url = 'https://www.guitarguitar.co.uk/product/190319340849008--gibson-les-paul-standard-60s-iced-tea'
html = requests.get(url).text
soup = BeautifulSoup(html, "lxml")
try:
productsRating = soup.find('h2', string=lambda s: "Customer reviews" in s).find_next_siblings()[0].find('strong').text
except:
productsRating = None
print(productsRating)

Scrape through website and iterate over seach results to get specific data

I'm trying to work on a project to scrape www.boattrader.com to push 800 listings with the Make, Price, and Phone Number of each boat to a CSV file.
I'm looking for guidance on the best way to scrape the links to each boat listing from the search results and then parse through each individual page to grab the Make, Price and Phone number.
Any guidance would be much appreciated it!
Thanks again!
from bs4 import BeautifulSoup, SoupStrainer
import requests
def extract_from_search(search_results):
# make this into a function
r = requests.get(search_results)
ad_page_html = r.text
soup = BeautifulSoup(ad_page_html, 'html.parser')
possible_links = soup.find_all('a', {'class': 'btn btn-orange'})
for link in possible_links:
if link.has_attr('href'):
boat_links = link.attrs['href']
return boat_links
search_results = 'http://www.boattrader.com/search-results/NewOrUsed-any/Type-all/Zip-90007/Radius-2000/Sort-Length:DESC/Page-1,50'
boat_links = extract_from_search(search_results)
print boat_links #why does this only print one link? What would be the best way to iterate over the search results, so I can put those links into the boat_listing variable to grab the information I'm looking for?
def extract_from_listing(boat_listing):
r = requests.get(boat_listing)
ad_page_html = r.text
soup = BeautifulSoup(ad_page_html, 'html.parser')
table_heads = soup.find_all('th')
for th in table_heads:
if th.text =="Make":
make = th.find_next_sibling("td").text
price = soup.find('span', {'class': 'bd-price'})
formatted_price = price.string.strip()
contact_info = soup.find('div', {'class': 'phone'})
reversed_phone = contact_info.string[::-1]
temp_phone = reversed_phone.replace(')', '}')
temp_phone2 = temp_phone.replace('(', ')')
correct_phone = temp_phone2.replace("}", "(")
return make, formatted_price, correct_phone
boat_listing = 'http://www.boattrader.com/listing/2009-Briggs-BR9134-Sportfish-102290211'
make, price, phone = extract_from_listing(boat_listing)
print make
print price
print phone

You are only returning the last link, you need to append:
def extract_from_search(search_results):
# make this into a function
r = requests.get(search_results)
ad_page_html = r.text
soup = BeautifulSoup(ad_page_html, 'html.parser')
possible_links = soup.find_all('a', {'class': 'btn btn-orange'})
boat_links = [] # create list to append all inks to
for link in possible_links:
if link.has_attr('href'):
boat_links.append(link.attrs['href']) # append each link
return boat_links
Or use a list comp:
def extract_from_search(search_results):
# make this into a function
r = requests.get(search_results)
ad_page_html = r.content # use content to let requests handle the decoding
soup = BeautifulSoup(ad_page_html, 'html.parser')
possible_links = soup.find_all('a', {'class': 'btn btn-orange'})
return [link.attrs['href'] for link in possible_links if link.has_attr('href')]

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Extracting URL From Span Element Without href - python

Related

Webscraping with BS4 NoneType object has no attribute find

Extract text from custom <h2> in <div> elements by BeautifulSoup

How to list out all the h2, h3, and p tags then create a dataframe to store them

How to extract text within h4 strong?

Scrape through website and iterate over seach results to get specific data

Categories

Resources