How to create a looping URL programmatically to scrape - Python

I have this code I am using to scrape; however, I am lost as to how I can make the Python code loop over the result pages and save everything so I can write it all to a .csv file. Any help would be greatly appreciated :)
import requests
from bs4 import BeautifulSoup

url = "http://www.yellowpages.com/search?search_terms=bodyshop&geo_location_terms=Fort+Lauderdale%2C+FL"
r = requests.get(url)
soup = BeautifulSoup(r.content)

links = soup.find_all("a")
for link in links:
    print "<a href='%s'>%s</a>" % (link.get("href"), link.text)

g_data = soup.find_all("div", {"class": "info"})
for item in g_data:
    print item.contents[0].find_all("a", {"class": "business-name"})[0].text
    try:
        print item.contents[1].find_all("span", {"itemprop": "streetAddress"})[0].text
    except:
        pass
    try:
        print item.contents[1].find_all("span", {"itemprop": "addressLocality"})[0].text.replace(',', '')
    except:
        pass
    try:
        print item.contents[1].find_all("span", {"itemprop": "addressRegion"})[0].text
    except:
        pass
    try:
        print item.contents[1].find_all("span", {"itemprop": "postalCode"})[0].text
    except:
        pass
    try:
        print item.contents[1].find_all("li", {"class": "primary"})[0].text
    except:
        pass
I know that with this code:
url_page2 = url + '&page=' + str(2) + '&s=relevance'
I can loop to the second page, but how could one loop to all the page results of the website and make the results available in a .csv file?

Make an endless loop that increments the page number starting from 1, and exit it when you get no results. Define a list of fields to extract and rely on the itemprop attribute to get the field values. Collect the items in a list of dictionaries, which you can later write into a csv file:
from pprint import pprint

import requests
from bs4 import BeautifulSoup

url = "http://www.yellowpages.com/search?search_terms=bodyshop&geo_location_terms=Fort%20Lauderdale%2C%20FL&page={page}&s=relevance"
fields = ["name", "streetAddress", "addressLocality", "addressRegion", "postalCode", "telephone"]

data = []
index = 1
while True:
    page_url = url.format(page=index)  # keep the template intact for the next iteration
    index += 1

    response = requests.get(page_url)
    soup = BeautifulSoup(response.content)

    page_results = soup.select('div.result')
    # exiting the loop if no results
    if not page_results:
        break

    for item in page_results:
        result = dict.fromkeys(fields)
        for field in fields:
            try:
                result[field] = item.find(itemprop=field).get_text(strip=True)
            except AttributeError:
                pass
        data.append(result)

    break  # DELETE ME

pprint(data)
For the first page, it prints:
[{'addressLocality': u'Fort Lauderdale,',
  'addressRegion': u'FL',
  'name': u"Abernathy's Paint And Body Shop",
  'postalCode': u'33315',
  'streetAddress': u'1927 SW 1st Ave',
  'telephone': u'(954) 522-8923'},
 ...
 {'addressLocality': u'Fort Lauderdale,',
  'addressRegion': u'FL',
  'name': u'Mega Auto Body Shop',
  'postalCode': u'33304',
  'streetAddress': u'828 NE 4th Ave',
  'telephone': u'(954) 523-9331'}]
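To finish the last step the answer mentions (writing the dictionaries to a csv file), here is a minimal sketch using csv.DictWriter. It assumes the data and fields variables from the snippet above; the results.csv filename is arbitrary:

import csv

# Write the collected dictionaries to a CSV file.
with open("results.csv", "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=fields)
    writer.writeheader()    # header row: name, streetAddress, ...
    writer.writerows(data)  # one row per scraped business; missing fields come out empty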

Related

Webscraping with BS4 NoneType object has no attribute find

I'm not sure why my code isn't working. I get AttributeError: 'NoneType' object has no attribute 'find'
My code is as follows:
import requests
from bs4 import BeautifulSoup
import csv

root_url = "https://urj.org/urj-congregations?congregation=&distance_address_field=&distance_num_miles=5.0&worship_services=All&community=All&urj_camp_affiliations=All&page=0"
html = requests.get(root_url)
soup = BeautifulSoup(html.text, 'html.parser')
paging = soup.find("nav",{"aria-label":"pagination-heading-3"}).find("li",{"class":"page-item"}).find_all("a")
start_page = paging[1].text
last_page = paging[len(paging)-2].text

outfile = open('congregationlookup.csv','w', newline='')
writer = csv.writer(outfile)
writer.writerow(["Name", "Address", "Phone"])

pages = list(range(1,int(last_page)+1))

for page in pages:
    url = 'https://urj.org/urj-congregations?congregation=&distance_address_field=&distance_num_miles=5.0&worship_services=All&community=All&urj_camp_affiliations=All&page=%s' %(page)
    html = requests.get(url)
    soup = BeautifulSoup(html.text, 'html.parser')
    #print(soup.prettify())
    print ('Processing page: %s' %(page))

    name_list = soup.findAll("div",{"class":"views-field views-field-congregation"})
    for element in name_list:
        name = element.find('h3').text
        address = element.find('field-content mb-2').text.strip()
        phone = element.find("i",{"class":"fa fa-phone mr-1"}).text.strip()
        writer.writerow([name, address, phone])

outfile.close()
print ('Done')
I'm trying to scrape the name, address, and phone number from the URJ Congregations website.
Thank you
Final code
import csv

import requests
from bs4 import BeautifulSoup

# root_url = "https://urj.org/urj-congregations?congregation=&distance_address_field=&distance_num_miles=5.0&worship_services=All&community=All&urj_camp_affiliations=All&page=0"
# html = requests.get(root_url)
# soup = BeautifulSoup(html.text, 'html.parser')
# paging = soup.find("nav", {"aria-label": "pagination-heading--3"}).find("ul", {"class": "pagination"}).find_all("a")
# start_page = paging[1].text
# last_page = paging[len(paging) - 3].text

outfile = open('congregationlookup.csv', 'w', newline='')
writer = csv.writer(outfile)
writer.writerow(["Name", "Address", "Phone"])

pages = list(range(1, 1000))

for page in pages:
    url = 'https://urj.org/urj-congregations?congregation=&distance_address_field=&distance_num_miles=5.0&worship_services=All&community=All&urj_camp_affiliations=All&page=%s' % (
        page)
    html = requests.get(url)
    soup = BeautifulSoup(html.text, 'html.parser')
    # print(soup.prettify())
    print('Processing page: %s' % (page))

    elements = soup.find_all("div", {"class": "views-row"})
    if len(elements) == 0:
        break

    for element in elements:
        name = element.find("div", {"class": "views-field views-field-congregation"}).text.strip()
        address = element.find("div", {"class": "views-field views-field-country"}).text.strip()
        phone = element.find("div", {"class": "views-field views-field-website"}).text.strip().split("\n")[0]
        writer.writerow([name, address, phone])

outfile.close()
print('Done')
Most likely, one of your lookups is returning None. So, when you then attempt to run .find() on that result, you are calling a method on None, hence your error.
https://docs.python.org/3/library/stdtypes.html#str.find
Also, as an FYI, findAll() is the old BeautifulSoup 3 syntax; you should use find_all(). See: Difference between "findAll" and "find_all" in BeautifulSoup
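One way to make that failure visible without crashing (a minimal sketch, not the exact fix the poster used) is a small helper that tolerates missing tags:

def safe_text(parent, *args, **kwargs):
    """Return the stripped text of parent.find(...), or '' when nothing matches."""
    tag = parent.find(*args, **kwargs)
    return tag.get_text(strip=True) if tag is not None else ''

# Used inside the question's loop, e.g.:
#   name  = safe_text(element, 'h3')
#   phone = safe_text(element, 'i', {'class': 'fa fa-phone mr-1'})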
There are a load of problems.
The first problem is
"pagination-heading--3"
instead of
"pagination-heading-3"
Next, I changed
paging = soup.find("nav",{"aria-label":"pagination-heading-3"}).find("li",{"class":"page-item"}).find_all("a")
to
paging = soup.find("nav", {"aria-label": "pagination-heading--3"}).find("ul", {"class": "pagination"}).find_all("a")
This is the line where I swapped in the first problematic string. I also changed the second search to find the ul; you were finding a single li and searching inside of it, which would have produced an empty list.
Next,
last_page = paging[len(paging) - 3].text
since you are trying to get the 3rd element from the end.
It still doesn't work; I will keep updating.

Fix BeautifulSoup code to get data from all pages and output into csv

Complete beginner. Please help. I've got this code, which worked when I did not try to output to .csv but instead had a print command there - so I didn't have the last 2 lines or anything related to variable 'data'. By 'worked' I mean it printed data from all 18 pages.
Now it outputs data into .csv but only from the first page (url).
I see that I'm not passing nexturl into the pandas at the end - because I don't know how to. Help greatly appreciated.
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = 'https://www.marketresearch.com/search/results.asp?qtype=2&datepub=3&publisher=Technavio&categoryid=0&sortby=r'

def scrape_it(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html.parser')
    nexturl = soup.find_all(class_="standardLinkDkBlue")[-1]['href']
    stri = soup.find_all(class_="standardLinkDkBlue")[-1].string
    reports = soup.find_all("tr", {"class": ["SearchTableRowAlt", "SearchTableRow"]})
    data = []

    for report in reports:
        data.append({
            'title': report.find('a', class_='linkTitle').text,
            'price': report.find('div', class_='resultPrice').text,
            'date_author': report.find('div', class_='textGrey').text.replace(' | published by: TechNavio', ''),
            'detail_link': report.a['href']
        })

    if 'next' not in stri:
        print("All pages completed")
    else:
        scrape_it(nexturl)

    return data

myOutput = pd.DataFrame(scrape_it(url))
myOutput.to_csv(f'results-tec6.csv', header=False)
Make data global so you keep appending to it during the loop rather than re-creating it afresh. Then call your recursive function outside the DataFrame() call, so that you can pass data to pandas afterwards.
Finally, you can pass a cookie to get the maximum possible results per request, which reduces the number of requests.
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = 'https://www.marketresearch.com/search/results.asp?qtype=2&datepub=3&publisher=Technavio&categoryid=0&sortby=r&page=1'
data = []

def scrape_it(url):
    page = requests.get(url, headers={'Cookie': 'ResultsPerPage=100'})
    soup = BeautifulSoup(page.text, 'html.parser')
    nexturl = soup.find_all(class_="standardLinkDkBlue")[-1]['href']
    stri = soup.find_all(class_="standardLinkDkBlue")[-1].string
    reports = soup.find_all("tr", {"class": ["SearchTableRowAlt", "SearchTableRow"]})

    for report in reports:
        data.append({
            'title': report.find('a', class_='linkTitle').text,
            'price': report.find('div', class_='resultPrice').text,
            'date_author': report.find('div', class_='textGrey').text.replace(' | published by: TechNavio', ''),
            'detail_link': report.a['href']
        })

    if 'next' not in stri:
        print("All pages completed")
    else:
        scrape_it(nexturl)

scrape_it(url)
myOutput = pd.DataFrame(data)
myOutput.to_csv(f'results-tec6.csv', header=False)
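As a design note, the recursion above adds one stack frame per page. An equivalent iterative version is sketched below; it makes the same assumptions about the site's "next" link markup as the answer's code and is not a drop-in from the original answer, but it avoids Python's recursion limit on very long result sets:

import requests
from bs4 import BeautifulSoup
import pandas as pd

url = 'https://www.marketresearch.com/search/results.asp?qtype=2&datepub=3&publisher=Technavio&categoryid=0&sortby=r&page=1'
data = []

while True:
    page = requests.get(url, headers={'Cookie': 'ResultsPerPage=100'})
    soup = BeautifulSoup(page.text, 'html.parser')
    reports = soup.find_all("tr", {"class": ["SearchTableRowAlt", "SearchTableRow"]})
    for report in reports:
        data.append({
            'title': report.find('a', class_='linkTitle').text,
            'price': report.find('div', class_='resultPrice').text,
            'date_author': report.find('div', class_='textGrey').text.replace(' | published by: TechNavio', ''),
            'detail_link': report.a['href'],
        })
    # Same stopping rule as the recursive version: the last standardLinkDkBlue
    # link is assumed to be the pager link whose text contains 'next'.
    last_link = soup.find_all(class_="standardLinkDkBlue")[-1]
    if 'next' not in last_link.string:
        break
    url = last_link['href']  # follow the 'next' link

pd.DataFrame(data).to_csv('results-tec6.csv', header=False)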

Selenium- BS4 : Facing issue in scraping webpage

I want to scrape the company info of all the companies from the below given URL and view their job details
URL : http://desiopt.com/search-results-jobs/
from selenium import webdriver
import bs4
import pandas as pd
from bs4 import BeautifulSoup
import re

driver = webdriver.Chrome(executable_path=r"C:/Users/Chandra Sekhar/Desktop/chrome-driver/chromedriver.exe")
titles = []
driver.get("http://desiopt.com/search-results-jobs/")
content = driver.page_source
soup = BeautifulSoup(content)

for a in soup.findAll('div', attrs={'class': 'listing-links'}):
    info = a.find('div', attrs={'class': 'userInfo'})
    print(info.text)
    titles.append(info.text)

df = pd.DataFrame({'Company info': titles})
df['Price'] = df['Price'].map(lambda x: re.sub(r'\W+', '', x))
df.to_csv('products1.csv', index=False)
Use the following url:
https://desiopt.com/search-results-jobs/?action=search&page=&listings_per_page=&view=list
There are two parameters here which you will edit, page= and listings_per_page=.
Currently the website has 37091 jobs.
From my testing, listings_per_page is limited to 1000 per page.
Example: https://desiopt.com/search-results-jobs/?action=search&page=1&listings_per_page=1000&view=list
So you will need to loop from page=1 to page=38 and set listings_per_page=1000,
which means 1000 results per page * 38 pages = 38000.
After that:
Collect all the links into a list, with a check to remove duplicates, if you care about preserving order. Otherwise pass them to a set, which doesn't accept duplicates but doesn't preserve order. Then parse each url in the list or set to collect the information; a sketch of the set approach follows.
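Here is a minimal sketch of that 38-page variant. It reuses the span.captions-field selector from the answer's own script below, so treat that selector (and the page count) as assumptions:

import requests
from bs4 import BeautifulSoup

links = set()  # a set silently drops duplicate hrefs, but does not preserve order

for page in range(1, 39):  # 38 pages * 1000 listings covers the ~37091 jobs
    r = requests.get(
        f"https://desiopt.com/search-results-jobs/?action=search&page={page}&listings_per_page=1000&view=list")
    if r.status_code == 200:
        soup = BeautifulSoup(r.text, 'html.parser')
        for span in soup.findAll('span', attrs={'class': 'captions-field'}):
            for a in span.findAll('a'):
                links.add(a.get('href'))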
By the way, I'll loop over 371 pages with 100 items each, so I will get 37100 urls (or fewer if the last page has fewer than 100 urls), remove the duplicates, and then parse:
import requests
from bs4 import BeautifulSoup
import csv

links = []

try:
    for item in range(1, 372):
        print(f"Extraction Page# {item}")
        r = requests.get(
            f"https://desiopt.com/search-results-jobs/?action=search&page={item}&listings_per_page=100&view=list")
        if r.status_code == 200:
            soup = BeautifulSoup(r.text, 'html.parser')
            for item in soup.findAll('span', attrs={'class': 'captions-field'}):
                for a in item.findAll('a'):
                    a = a.get('href')
                    if a not in links:
                        links.append(a)
except KeyboardInterrupt:
    print("Good Bye!")
    exit()

data = []

try:
    for link in links:
        r = requests.get(link)
        if r.status_code == 200:
            soup = BeautifulSoup(r.text, 'html.parser')
            for item in soup.findAll('div', attrs={'class': 'compProfileInfo'}):
                a = [a.text.strip() for a in item.findAll('span')]
                if a[6] == '':
                    a[6] = 'N/A'
                data.append(a[0:7:2])
except KeyboardInterrupt:
    print("Good Bye!")
    exit()

while True:
    try:
        with open('output.csv', 'w+', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(['Name', 'Phone', 'Email', 'Website'])
            writer.writerows(data)
            print("Operation Completed")
    except PermissionError:
        print("Please Close The File")
        continue
    except KeyboardInterrupt:
        print("Good Bye")
        exit()
    break
The output is 1885 rows, because I let the script remove duplicated company links before parsing.

Scrape multiple pages with Beautiful soup

I am trying to scrape multiple pages of a url.
But I am only able to scrape the first page. Is there a way to get all the pages?
Here is my code.
from bs4 import BeautifulSoup as Soup
import urllib, requests, re, pandas as pd

pd.set_option('max_colwidth', 500)  # to remove column limit (Otherwise, we'll lose some info)
df = pd.DataFrame()

Comp_urls = ['https://www.indeed.com/jobs?q=Dell&rbc=DELL&jcid=0918a251e6902f97', 'https://www.indeed.com/jobs?q=Harman&rbc=Harman&jcid=4faf342d2307e9ed','https://www.indeed.com/jobs?q=johnson+%26+johnson&rbc=Johnson+%26+Johnson+Family+of+Companies&jcid=08849387e791ebc6','https://www.indeed.com/jobs?q=nova&rbc=Nova+Biomedical&jcid=051380d3bdd5b915']

for url in Comp_urls:
    target = Soup(urllib.request.urlopen(url), "lxml")
    targetElements = target.findAll('div', class_ =' row result')

for elem in targetElements:
    comp_name = elem.find('span', attrs={'class':'company'}).getText().strip()
    job_title = elem.find('a', attrs={'class':'turnstileLink'}).attrs['title']
    home_url = "http://www.indeed.com"
    job_link = "%s%s" % (home_url, elem.find('a').get('href'))
    job_addr = elem.find('span', attrs={'class':'location'}).getText()
    date_posted = elem.find('span', attrs={'class': 'date'}).getText()
    description = elem.find('span', attrs={'class': 'summary'}).getText().strip()

    comp_link_overall = elem.find('span', attrs={'class':'company'}).find('a')
    if comp_link_overall != None:
        comp_link_overall = "%s%s" % (home_url, comp_link_overall.attrs['href'])
    else: comp_link_overall = None

    df = df.append({'comp_name': comp_name, 'job_title': job_title,
                    'job_link': job_link, 'date_posted': date_posted,
                    'overall_link': comp_link_overall, 'job_location': job_addr, 'description': description
                    }, ignore_index=True)

df
df.to_csv('path\\web_scrape_Indeed.csv', sep=',', encoding='utf-8')
Please suggest if there is any way.
Case 1: The code presented here is exactly what you have
Comp_urls = ['https://www.indeed.com/jobs?q=Dell&rbc=DELL&jcid=0918a251e6902f97', 'https://www.indeed.com/jobs?q=Harman&rbc=Harman&jcid=4faf342d2307e9ed','https://www.indeed.com/jobs?q=johnson+%26+johnson&rbc=Johnson+%26+Johnson+Family+of+Companies&jcid=08849387e791ebc6','https://www.indeed.com/jobs?q=nova&rbc=Nova+Biomedical&jcid=051380d3bdd5b915']

for url in Comp_urls:
    target = Soup(urllib.request.urlopen(url), "lxml")
    targetElements = target.findAll('div', class_ =' row result')

for elem in targetElements:
The problem here is targetElements changes with every iteration in the first for loop.
To avoid this, indent the second for loop inside the first like so:
for url in Comp_urls:
    target = Soup(urllib.request.urlopen(url), "lxml")
    targetElements = target.findAll('div', class_ =' row result')
    for elem in targetElements:
Case 2: Your bug is not a result of improper indentation (i.e. it is not like what is in your original post)
If your code is properly indented, then it may be the case that targetElements is an empty list. This means target.findAll('div', class_ =' row result') does not return anything. In that case, visit the sites, check out the DOM, then modify your scraping program.
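For Case 2, a quick way to check which situation you are in (a sketch that reuses Comp_urls from the question, with the urllib import spelled out) is to print how many rows each page actually returns before parsing anything:

import urllib.request
from bs4 import BeautifulSoup as Soup

for url in Comp_urls:  # Comp_urls as defined in the question
    target = Soup(urllib.request.urlopen(url), "lxml")
    targetElements = target.findAll('div', class_=' row result')
    # 0 here means the ' row result' selector no longer matches the page markup,
    # so the fix is a new selector rather than indentation.
    print(url, '->', len(targetElements), 'rows')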

Pagination with BeautifulSoup

I am trying to get some data from the following website. https://www.drugbank.ca/drugs
For every drug in the table, I need to go deeper and get the name and some other specific features, like categories and structured indication (please click on a drug name to see the features I will use).
I wrote the following code, but the issue is that I can't make my code handle pagination (as you can see, there are more than 2000 pages!).
import requests
from bs4 import BeautifulSoup

def drug_data():
    url = 'https://www.drugbank.ca/drugs/'
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "lxml")

    for link in soup.select('name-head a'):
        href = 'https://www.drugbank.ca/drugs/' + link.get('href')
        pages_data(href)

def pages_data(item_url):
    r = requests.get(item_url)
    soup = BeautifulSoup(r.text, "lxml")

    g_data = soup.select('div.content-container')

    for item in g_data:
        print item.contents[1].text
        print item.contents[3].findAll('td')[1].text
        try:
            print item.contents[5].findAll('td', {'class': 'col-md-2 col-sm-4'})[0].text
        except:
            pass
        print item_url

drug_data()
How can I scrape all of the data and handle pagination properly?
This page uses almost the same url for all pages, so you can use a for loop to generate them:
def drug_data(page_number):
    url = 'https://www.drugbank.ca/drugs/?page=' + str(page_number)
    #... rest ...

# --- later ---

for x in range(1, 2001):
    drug_data(x)
Or use while and try/except to get more than 2000 pages:
def drug_data(page_number):
    url = 'https://www.drugbank.ca/drugs/?page=' + str(page_number)
    #... rest ...

# --- later ---

page = 0
while True:
    try:
        page += 1
        drug_data(page)
    except Exception as ex:
        print(ex)
        print("probably last page:", page)
        break  # exit `while` loop
You can also find the url of the next page in the HTML:
<a rel="next" class="page-link" href="/drugs?approved=1&c=name&d=up&page=2">›</a>
so you can use BeautifulSoup to get this link and follow it.
The code below displays the current url, finds the link to the next page (using class="page-link" and rel="next") and loads it:
import requests
from bs4 import BeautifulSoup

def drug_data():
    url = 'https://www.drugbank.ca/drugs/'

    while url:
        print(url)

        r = requests.get(url)
        soup = BeautifulSoup(r.text, "lxml")

        #data = soup.select('name-head a')
        #for link in data:
        #    href = 'https://www.drugbank.ca/drugs/' + link.get('href')
        #    pages_data(href)

        # next page url
        url = soup.findAll('a', {'class': 'page-link', 'rel': 'next'})
        print(url)

        if url:
            url = 'https://www.drugbank.ca' + url[0].get('href')
        else:
            break

drug_data()
BTW: never use except: pass, because you can get an error which you didn't expect and you will not know why it doesn't work. It's better to display the error:
except Exception as ex:
    print('Error:', ex)
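For example, wrapping the per-page request like this (a sketch based on the drug_data(page_number) variant above; raise_for_status() is a standard requests call) reports which page failed instead of hiding it:

import requests
from bs4 import BeautifulSoup

def drug_data(page_number):
    url = 'https://www.drugbank.ca/drugs/?page=' + str(page_number)
    try:
        r = requests.get(url)
        r.raise_for_status()  # turn HTTP errors (404, 500, ...) into exceptions
    except Exception as ex:
        print('Error on', url, ':', ex)
        return
    soup = BeautifulSoup(r.text, "lxml")
    # ... parse soup as before ...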
