extract names in custom <h2> but It is extracted many times beautifulsoup - python

I am trying to extract names in custom <h2>, but the names I want are extracted many times.
how to fix this problem and extract it one time
The page I am pulling data from
here
import requests
import csv
from bs4 import BeautifulSoup
from itertools import zip_longest
lawy_name = []
page_num = 1
phone = []
logo = []
website = []
links = []
while True:
try:
result = requests.get(f"https://example.com/motor-vehicle-accidents/texas/houston/page{page_num}/")
src = result.content
soup = BeautifulSoup(src, "lxml")
page_limit = int("126")
if(page_num > page_limit // 25):
print("page ended, terminate")
break
lawy_names = soup.select('div.poap.serp-container.lawyer h2.indigo_text')
for i in range(len(lawy_names)) :
lawy_name.append(lawy_names[i].text.strip())
links.append(lawy_names[i].find("a").attrs["href"])
for link in links:
result = requests.get(link)
src = result.content
soup = BeautifulSoup(src, "lxml")
phones = soup.find("a", {"class":"profile-phone-header profile-contact-btn"})
phone.append(phones["href"])
logos = soup.find("div", {"class":"photo-container"})
logo.append(logos.find('img')['src'])
websites = soup.find("a", {"class":"profile-website-header","id":"firm_website"})
website.append(websites.text.strip())
page_num +=1
print("page switched")
except:
print("error")
break
file_list = [lawy_name, phone, website, logo]
exported = zip_longest(*file_list)
with open("/Users/dsoky/Desktop/fonts/Moaaz.csv", "w") as myfile:
wr = csv.writer(myfile)
wr.writerow(["lawyer name","phone","website","logo"])
wr.writerows(exported)
Problem:

The website does produce a lot of duplicate entries. You could probably assume that all entries have unique names, as such a dictionary could be used to hold all of your data. Simply skip any entries for which you have already seen the same name. For example:
from bs4 import BeautifulSoup
import requests
import csv
lawyers = {}
page_num = 1
while True:
print(f"Page {page_num}")
req = requests.get(f"https://example.com/motor-vehicle-accidents/texas/houston/page{page_num}/")
soup = BeautifulSoup(req.content, "lxml")
found = False
for id in ['sponsored_serps', 'ts_results', 'poap_results', 'basic_results']:
div_results = soup.find('div', id=id)
if div_results:
for result in div_results.find_all('div', class_='lawyer'):
name = result.h2.get_text(strip=True)
if name not in lawyers:
print(' ', name)
link = result.h2.a['href']
req_details = requests.get(link)
soup_details = BeautifulSoup(req_details.content, "lxml")
a_phone = soup_details.find("a", {"class":"profile-phone-header profile-contact-btn"}, href=True)
if a_phone:
phone = a_phone['href']
else:
phone = None
div_logo = soup_details.find("div", {"class":"photo-container"})
if div_logo.img:
logo = div_logo.img['src']
else:
logo = None
a_website = soup_details.find("a", {"class":"profile-website-header","id":"firm_website"})
if a_website:
website = a_website.get_text(strip=True)
else:
website = None
lawyers[name] = [phone, logo, website]
found = True
# Keep going until no new names found
if found:
page_num += 1
else:
break
with open('Moaaz.csv', 'w', newline='') as f_output:
csv_output = csv.writer(f_output)
csv_output.writerow(['Name', 'Phone', 'Logo', 'Website'])
for name, details in lawyers.items():
csv_output.writerow([name, *details])

Related

Extracting URL From Span Element Without href

I am attempting to extract links from a website that does not use a href. I have tried multiple iterations of trying to find the tag associated with the url that from what I can gather is between <span> elements.
import requests
from bs4 import BeautifulSoup
url = 'https://www.flavortownusa.com/locations'
page = requests.get(url)
f = open("test12.csv", "w")
soup = BeautifulSoup(page.content, 'html.parser')
lists = soup.find_all('div', class_ = 'listing-item-inner')
for list in lists:
title = list.find('span', class_ = '$0')
webs = list.find('#text', class_ = 'fa-fa.link')
address = list.find('ul', class_ = 'post-meta')
temp = list.find('span', class_ = 'text')
temp2 = list.find('i', class_ = '(text)')
info = [title, webs, address, temp, temp2]
f.write(str(info))
f.write("\n")
print(info)
The desired output is to extract data from <span></span> where the 345 40th Ave N and the url below i class = 'fa fa-link' and i class = 'fa fa-phone' where the three elements are placed into a CSV File
You could call next element e.find(class_ = 'fa-link').nextafter selecting the <i> with class fa-link:
for e in lists:
print(e.find(class_ = 'fa-link').next.strip() if e.find(class_ = 'fa-link') else '')
Note: Do not use reserved keywords like list and always check if element you are searching for is available.
Example
import requests
from bs4 import BeautifulSoup
url = 'https://www.flavortownusa.com/locations'
soup = BeautifulSoup(page.content, 'html.parser')
with open('somefile.csv', 'a', encoding='utf-8') as f:
for e in soup.find_all('div', class_ = 'listing-item-inner'):
title = e.h3.text
webs = e.select_one('.fa-link').next if e.select_one('.fa-link') else ''
address = e.span.text
phone = e.select_one('.fa-phone').next if e.select_one('.fa-phone') else ''
f.write(','.join([title, webs, address, phone])+'\n')

Extract text from custom <h2> in <div> elements by BeautifulSoup

Hi i try to extract the name from h2 but an error occurs and names are extracted from other <h2> I want to extract names from <h2> specified from only <div class="poap serp-container lawyer"><div class="gray_border"><div class="col-lg-8 col-md-8 col-sm-9 col-xs-8 text_container"><h2 class=""indigo_text>Hi My name is Mark</h2></div></div></div>
import requests
import csv
from bs4 import BeautifulSoup
from itertools import zip_longest
name = []
page_num = 1
phone = []
logo = []
website = []
links = []
while True:
try:
result = requests.get(f"https://attorneys.superlawyers.com/motor-vehicle-accidents/texas/houston/page{page_num}/")
src = result.content
soup = BeautifulSoup(src, "lxml")
page_limit = int("126")
if(page_num > page_limit // 20):
print("page ended, terminate")
break
names = soup.find_all("h2", {"class":"indigo_text"})
for i in range(len(names)) :
name.append(names[i].text.strip())
links.append(names[i].find("a").attrs["href"])
for link in links:
result = requests.get(link)
src = result.content
soup = BeautifulSoup(src, "lxml")
phones = soup.find("a", {"class":"profile-phone-header profile-contact-btn"})
phone.append(phones["href"])
logos = soup.find("div", {"class":"photo-container"})
logo.append(logos.find('img')['src'])
websites = soup.find("a", {"class":"profile-website-header","id":"firm_website"})
website.append(websites.text.strip())
page_num +=1
print("page switched")
except:
print("error")
break
file_list = [name, phone, website, logo]
exported = zip_longest(*file_list)
with open("/Users/dsoky/Desktop/fonts/Moaaz.csv", "w") as myfile:
wr = csv.writer(myfile)
wr.writerow(["name","phone","website","logo"])
wr.writerows(exported)
I hope you guys can help me solve this problem
Select your tag more specific for example with following css selector:
names = soup.select('div.poap h2')
or with all the classes:
names = soup.select('div.poap.serp-container.lawyer h2.indigo_text')
Note This answer just focus to main point in question, code could be imporved to avoid some side effects.

Get data from product page and back Scraper

class Crawler():
def __init__(self):
self.pag = 1
i = 0
def get_urls(self,main_url):
self.url = 'https://www.test.ro/search/'+ main_url +'/p1'
self.filename = main_url
r = requests.get(self.url)
soup = BeautifulSoup(r.text, 'html.parser')
number_pages = soup.find(class_= 'row' )
last_page = number_pages.find_all('a')[len(number_pages.find_all('a'))-2].get("data-page")
for i in range(1, int(last_page)+1):
url.append('https://www.test.ro/search/'+ main_url +'/p' + str(i))
def print_urls(self):
for urls in url:
print (urls)
def scrape(self,url):
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
product_list = soup.find(class_ = 'page-container')
product_list_name = product_list.find_all('h2')
product_list_oldprice = product_list.find_all(class_ = 'product-old-price')
product_list_newprice = product_list.find_all(class_ = 'product-new-price')
for i in range(0, len(product_list_name)):
name = product_list_name[i].get_text().strip()
link = product_list_name[i].find('a').get('href')
#print(name)
#print(len(name))
try:
price = product_list_oldprice[i].contents[0].get_text()
price = price[:-6]
#print(price)
except IndexError:
#print("no old price")
#print(product_list_newprice[i].contents[0])
with open(self.filename+'.csv', 'a', encoding = 'utf-8', newline='') as csv_file:
file_is_empty = os.stat(self.filename+'.csv').st_size == 0
fieldname = ['name','link', 'price_old', 'price_actualy']
writer = csv.DictWriter(csv_file, fieldnames = fieldname)
if file_is_empty:
writer.writeheader()
writer.writerow({'name':name,'link':link, 'price_old':price, 'price_actualy':product_list_newprice[i].contents[0]})
if __name__=='__main__':
print("Search for product: ")
urlsearch = input()
starttime = time.time()
scraper = Crawler()
scraper.get_urls(urlsearch)
scraper.print_urls()
#scraper.scrape(url[0])
pool = multiprocessing.Pool()
pool.map(scraper.scrape,url)
pool.close()
print('That took {} seconds'.format(time.time() - starttime))
So I have this scraper, it works perfectly on any website bag but only on the product page.
I did it for a specific website, but how could I go on each page to take the data from the product and give it back and do it all over again?
Is such a thing possible?
I now take the data from the products page, ie name, link, price.
You have divs there too.
Can I help href?
In this case you need to create a category scraper that safes all product urls first. Scrape all urls and go through all the category's and for example safe them to csv first (the product urls). Then you can take all the product urls from the CSV and loop through all of them.

How to make request to new url?

I already have this code, helped by a friend before. I already get all the links in the site. I want to get the name, merk, price, picture, description of the product, and the link of the product. The description's product only appear if we click the product.
I'm a beginner in Python.
from bs4 import BeautifulSoup
import urllib.request
count = 1
url = "https://www.sociolla.com/155-foundation?p=%d"
def get_url(url):
req = urllib.request.Request(url)
return urllib.request.urlopen(req)
expected_url = url % count
response = get_url(expected_url)
link = []
name = []
merk = []
price = []
pic = []
description = []
while (response.url == expected_url):
#print("GET {0}".format(expected_url))
soup = BeautifulSoup(response.read(), "html.parser")
products = soup.find("div",{"id":"product-list-grid"})
for i in products:
data = products.findAll("div",{"class":"product-item"})
for j in range(0, len(data)):
link.append(data[j]["data-eec-href"])
count += 1
expected_url = url % count
response = get_url(expected_url)
print(len(link))
"""
import csv
dataset=zip(link, merk, name, pic, price, description)
with open("foundation_sociolla.csv","w", newline='') as csvfile:
writer=csv.writer(csvfile)
header=['link', 'merk', 'name', 'pic', 'price', 'description']
writer.writerow(header)
writer.writerows(dataset)
"""
You need to make a request to the URL. Parse the content of that request and extract the data you want.
from bs4 import BeautifulSoup
import urllib.request
count = 1
url = "https://www.sociolla.com/155-foundation?p=%d"
def get_url(url):
req = urllib.request.Request(url)
return urllib.request.urlopen(req)
expected_url = url % count
response = get_url(expected_url)
link = []
name = []
make = []
price = []
pic = []
description = []
while response.url == expected_url:
soup = BeautifulSoup(response.read(), "html.parser")
for product in soup.select("div.product-item"):
product_url = (product['data-eec-href'])
link.append(product_url)
product_response = get_url(product_url)
product_soup = BeautifulSoup(product_response.read(), "html.parser")
product_pic = product_soup.select('img#bigpic')[0]['src']
pic.append(product_pic)
product_price = product_soup.select('span#our_price_display')[0].text.strip()
price.append(product_price)
product_name = product_soup.select('div.detail-product-logo p')[0].text.strip()
name.append(product_name)
product_make = product_soup.select('div.detail-product-logo h3')[0].text.strip()
make.append(product_make)
product_description = product_soup.select('div#Details article')[0].text.strip()
description.append(product_description)
print(product_url, product_pic, product_price, product_name, product_make, product_description)
count += 1
expected_url = url % count
response = get_url(expected_url)
But if your going to scrape a lot of pages you are much better off using something like Scrapy https://scrapy.org/

Scrape site with multiple links without "next" button using beautiful soup

I am very new to python (three days in) and I have stumbled into a problem I can't solve with google/youtube. I want to scrape the National Governors Association for background data of all US governors and save this into a csv file.
I have managed to scrape a list of all governors, but to get more details I need to enter the page of each governor individually and save the data. I have found code suggestions online which utilises a "next" button or the url structure to loop over several sites. This website, however, does not have a next button and the url-links does not follow a loopable structure. So I am stuck.
I would appreciate any help I can get very much. I want to extract the info above the main text (Office Dates, School(s) etc in the "address" tag) in each governors page, for example in this one.
This is what I have got so far:
import bs4 as bs
import urllib.request
import pandas as pd
url = 'https://www.nga.org/cms/FormerGovBios?begincac77e09-db17-41cb-9de0-687b843338d0=10&endcac77e09-db17-41cb-9de0-687b843338d0=9999&pagesizecac77e09-db17-41cb-9de0-687b843338d0=10&militaryService=&higherOfficesServed=&religion=&lastName=&sex=Any&honors=&submit=Search&college=&firstName=&party=&inOffice=Any&biography=&warsServed=&'
sauce = urllib.request.urlopen(url).read()
soup = bs.BeautifulSoup(sauce, "html.parser")
#dl list of all govs
dfs = pd.read_html(url, header=0)
for df in dfs:
df.to_csv('governors.csv')
#dl links to each gov
table = soup.find('table', 'table table-striped table-striped')
links = table.findAll('a')
with open ('governors_links.csv', 'w') as r:
for link in links:
r.write(link['href'])
r.write('\n')
r.close()
#enter each gov page and extract data in the "address" tag(s)
#save this in a csv file
I'm assuming that you've got all the links in a list named links.
You can do this to get the data you want of all the Governors one by one:
for link in links:
r = urllib.request.urlopen(link).read()
soup = bs.BeautifulSoup(r, 'html.parser')
print(soup.find('h2').text) # Name of Governor
for p in soup.find('div', {'class': 'col-md-3'}).findAll('p'):
print(p.text.strip()) # Office dates, address, phone, ...
for p in soup.find('div', {'class': 'col-md-7'}).findAll('p'):
print(p.text.strip()) # Family, school, birth state, ...
Edit:
Change your links list to
links = ['https://www.nga.org' + x.get('href') for x in table.findAll('a')]
This may work. I haven't tested it out to full completion since I'm at work but it should be a starting point for you.
import bs4 as bs
import requests
import re
def is_number(s):
try:
int(s)
return True
except ValueError:
return False
def main():
url = 'https://www.nga.org/cms/FormerGovBios?inOffice=Any&state=Any&party=&lastName=&firstName=&nbrterms=Any&biography=&sex=Any&religion=&race=Any&college=&higherOfficesServed=&militaryService=&warsServed=&honors=&birthState=Any&submit=Search'
sauce = requests.get(url).text
soup = bs.BeautifulSoup(sauce, "html.parser")
finished = False
csv_data = open('Govs.csv', 'a')
csv_data.write('Name,Address,OfficeDates,Success,Address,Phone,Fax,Born,BirthState,Party,Schooling,Email')
try:
while not finished:
#dl links to each gov
table = soup.find('table', 'table table-striped table-striped')
links = table.findAll('a')
for link in links:
info_array = []
gov = {}
name = link.string
gov_sauce = requests.get(r'https://nga.org'+link.get('href')).text
gov_soup = bs.BeautifulSoup(gov_sauce, "html.parser")
#print(gov_soup)
office_and_stuff_info = gov_soup.findAll('address')
for address in office_and_stuff_info:
infos = address.findAll('p')
for info in infos:
tex = re.sub('[^a-zA-Z\d:]','',info.text)
tex = re.sub('\\s+',' ',info.text)
tex = tex.strip()
if tex:
info_array.append(tex)
info_array = list(set(info_array))
gov['Name'] = name
secondarry_address = ''
gov['Address'] = ''
for line in info_array:
if 'OfficeDates:' in line:
gov['OfficeDates'] = line.replace('OfficeDates:','').replace('-','')
elif 'Succ' or 'Fail' in line:
gov['Success'] = line
elif 'Address' in line:
gov['Address'] = line.replace('Address:','')
elif 'Phone:' or 'Phone ' in line:
gov['Phone'] = line.replace('Phone ','').replace('Phone: ','')
elif 'Fax:' in line:
gov['Fax'] = line.replace('Fax:','')
elif 'Born:' in line:
gov['Born'] = line.replace('Born:','')
elif 'Birth State:' in line:
gov['BirthState'] = line.replace('BirthState:','')
elif 'Party:' in line:
gov['Party'] = line.replace('Party:','')
elif 'School(s)' in line:
gov['Schooling'] = line.replace('School(s):','').replace('School(s) ')
elif 'Email:' in line:
gov['Email'] = line.replace('Email:','')
else:
secondarry_address = line
gov['Address'] = gov['Address'] + secondarry_address
data_line = gov['Name'] +','+gov['Address'] +','+gov['OfficeDates'] +','+gov['Success'] +','+gov['Address'] +','+ gov['Phone'] +','+ gov['Fax'] +','+gov['Born'] +','+gov['BirthState'] +','+gov['Party'] +','+gov['Schooling'] +','+gov['Email']
csv_data.write(data_line)
next_page_link = soup.find('ul','pagination center-blockdefault').find('a',{'aria-label':'Next'})
if next_page_link.parent.get('class') == 'disabled':
finished = True
else:
url = r'https://nga.org'+next_page_link.get('href')
sauce = requests.get(url).text
soup = bs.BeautifulSoup(sauce,'html.parser')
except:
print('Code failed.')
finally:
csv_data.close()
if __name__ == '__main__':
main()

Categories