I'm trying to get leads from Yelp using Python and BeautifulSoup, but I'm not able to capture the fields for phone, name, address, and website (optional).
I'm getting the error shown below. I searched and tried different solutions, but they didn't work for me.
Here is my code:
from bs4 import BeautifulSoup
import requests
import sys
import csv
import requests, re, json

## Get the min and max page numbers
pagenum = 0
maxpage = 0

## loop through the pages
while pagenum <= maxpage:
    newsu = pagenum
    newsu = str(newsu)
    csvname = 'cardealers' + newsu + '.csv';
    csvfile = open(csvname, 'w', encoding="utf-8")
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(['Business name', 'phone', 'address'])

    headers = {'User-Agent': 'Mozilla/5.0'}
    r = requests.get('https://www.yelp.com/search?find_desc=Used%20Car%20Dealers&find_loc=New%20York%2C%20NY&ns=1&sortby=review_count&start={}'.format(pagenum), headers=headers)

    p = re.compile(r'PRELOADED_STATE__ = (.*?);')
    data = json.loads(p)
    print(data)
    pagenum = pagenum + 1

    for item in data['searchResult']['results']:
        name = item['businessName']
        phone = item['phone']
        address = ([item['address'], item['city'], item['state'], item['postalcode']])
        csv_writer.writerow([name, phone, address])
        print(name)

    csvfile.close()
Here is the error message:

Traceback (most recent call last):
  File "\Python\Python36\scraper\scrape.py", line 22, in <module>
    data = json.loads(p)
  File "\Python\Python36\lib\json\__init__.py", line 348, in loads
    'not {!r}'.format(s.__class__.__name__))
TypeError: the JSON object must be str, bytes or bytearray, not 'SRE_Pattern'
You are passing json.loads() something that is not a JSON string; in fact, you are passing it the compiled regex pattern itself, not a string at all.
Essentially, this is what you are doing:
data = json.loads('THIS IS JUST A STRING. NOT IN A JSON FORMAT')
So you want to extract the matching text first and parse that, something like data = json.loads(p.search(r.text).group(1)); note that p.findall() returns a list, not a string.
You actually need to pull that JSON out of the HTML. The other MAJOR issue, though, is that PRELOADED_STATE__ is not even in the HTML you are pulling, so the pattern will never match anything.
Also, you are not really iterating through anything. You start at pagenum=0 with maxpage=0 and run while pagenum <= maxpage, and since maxpage is never updated the loop never gets past the first page.
The JSON structure with the data is in the HTML, but it looks like it sits inside HTML comments, so you'll need to parse that out instead.
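For example, here is a minimal sketch of that corrected extraction, assuming the data really is wrapped in a <!--{...}--> comment in the page source (the regex and the comment wrapper are assumptions, not something Yelp guarantees):

import re, json
import requests

headers = {'User-Agent': 'Mozilla/5.0'}
r = requests.get('https://www.yelp.com/search?find_desc=Used%20Car%20Dealers&find_loc=New%20York%2C%20NY', headers=headers)

# Pull the comment-wrapped JSON out of the page, then parse the matched *string*
m = re.search(r'<!--(\{.*?\})-->', r.text, re.S)
data = json.loads(m.group(1)) if m else {}
print(list(data.keys()) if data else 'no embedded JSON found')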
Also, why do:
newsu = pagenum
newsu = str(newsu)
simply do newsu = str(pagenum). Do you really want a separate file for each iteration? I just put everything into one file:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import json
import math

## Get the min and max page numbers
pagenum = 0
results = pd.DataFrame()

with requests.Session() as s:
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36'}
    url = 'https://www.yelp.com/search?find_desc=Used%20Car%20Dealers&find_loc=New%20York%2C%20NY&ns=1&sortby=review_count&start={}'.format(pagenum)
    r = s.get(url, headers=headers)

    soup = BeautifulSoup(r.text, 'html.parser')
    scripts = soup.find_all('script')
    for script in scripts:
        if '<!--{' in script.text:
            jsonStr = script.text.split('<!--')[-1].split('-->')[0]
            jsonData = json.loads(jsonStr)

            totalPages = jsonData['searchPageProps']['searchResultsProps']['paginationInfo']['totalResults']
            resultsPerPage = jsonData['searchPageProps']['searchResultsProps']['paginationInfo']['resultsPerPage']
            totalPages = math.ceil(totalPages / resultsPerPage)

    ## loop through the pages
    for pagenum in range(0, totalPages + 1):
        url = 'https://www.yelp.com/search?find_desc=Used%20Car%20Dealers&find_loc=New%20York%2C%20NY&ns=1&sortby=review_count&start={}'.format(pagenum)
        r = s.get(url, headers=headers)

        soup = BeautifulSoup(r.text, 'html.parser')
        scripts = soup.find_all('script')
        for script in scripts:
            if '<!--{' in script.text:
                jsonStr = script.text.split('<!--')[-1].split('-->')[0]
                jsonData = json.loads(jsonStr)

                for each in jsonData['searchPageProps']['searchResultsProps']['searchResults']:
                    if 'searchResultBusiness' in each.keys():
                        busiName = each['searchResultBusiness']['name']
                        phone = each['searchResultBusiness']['phone']
                        address = each['searchResultBusiness']['formattedAddress']

                        temp_df = pd.DataFrame([[busiName, phone, address]], columns=['Business name', 'phone', 'address'])
                        # DataFrame.append was removed in pandas 2.x, so use pd.concat instead
                        results = pd.concat([results, temp_df], sort=False).reset_index(drop=True)

        print('Acquired page: %s' % pagenum)

results.to_csv('cardealers.csv', index=False)
Attempting to update my script so that it searches through not only the URL provided but all of the pages in the range (1-3) and adds them to the CSV. Can anyone spot why my current code is not working? The URLs for pages after the first have the following format: page-2
from bs4 import BeautifulSoup
import requests
from csv import writer
from random import randint
from time import sleep
#example of second page url: https://www.propertypal.com/property-for-sale/ballymena-area/page-2
url= "https://www.propertypal.com/property-for-sale/ballymena-area/"
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'}
for page in range(1, 4):
    req = requests.get(url + 'page-' + str(page), headers=headers)
    # print(page)

soup = BeautifulSoup(req.content, 'html.parser')
lists = soup.find_all('li', class_="pp-property-box")

with open('ballymena.csv', 'w', encoding='utf8', newline='') as f:
    thewriter = writer(f)
    header = ['Address', 'Price']
    thewriter.writerow(header)

    for list in lists:
        title = list.find('h2').text
        price = list.find('p', class_="pp-property-price").text
        info = [title, price]
        thewriter.writerow(info)

sleep(randint(2, 10))
You are overwriting req multiple times and end up only analyzing the results of the last page fetched. Put everything inside your loop.
edit: Also note that the upper limit in range() is not included, so for page in range(1, 4): is what gives you the first three pages.
edit full example:
from bs4 import BeautifulSoup
import requests
from csv import writer
url = "https://www.propertypal.com/property-for-sale/ballymena-area/page-"
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'}
with open('ballymena.csv', 'w', encoding='utf8', newline='') as f:
    thewriter = writer(f)
    header = ['Address', 'Price']
    thewriter.writerow(header)

    for page in range(1, 4):
        req = requests.get(f"{url}{page}", headers=headers)
        soup = BeautifulSoup(req.content, 'html.parser')

        for li in soup.find_all('li', class_="pp-property-box"):
            title = li.find('h2').text
            price = li.find('p', class_="pp-property-price").text
            info = [title, price]
            thewriter.writerow(info)
The solution from bitflip is fine; however, a few things I'll point out to help you:
Try to avoid variable names that shadow built-in Python names, list being one of those.
While the csv writer is fine to use, also consider pandas. Further down the road you will likely need to do some data manipulation, so you might as well familiarise yourself with the package now; it's a very powerful tool.
Here's how I would have coded it.
from bs4 import BeautifulSoup
import requests
import pandas as pd
from random import randint
from time import sleep
from os.path import exists
#example of second page url: https://www.propertypal.com/property-for-sale/ballymena-area/page-2
url= "https://www.propertypal.com/property-for-sale/ballymena-area/"
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'}
# Check if csv file exists
file_exists = exists('ballymena.csv')
for page in range(1, 4):
    rows = []
    req = requests.get(url + 'page-' + str(page), headers=headers)
    # print(page)
    soup = BeautifulSoup(req.content, 'html.parser')
    lists = soup.find_all('li', class_="pp-property-box")

    for li in lists:
        title = li.find('h2').text
        price = li.find('p', class_="pp-property-price").text
        row = {
            'Address': title,
            'Price': price}
        rows.append(row)

    df = pd.DataFrame(rows)

    # If the file doesn't exist yet, write it with a header row
    if not file_exists:
        df.to_csv('ballymena.csv', index=False)
        file_exists = True
    # If it already exists, append to the file
    else:
        df.to_csv('ballymena.csv', mode='a', header=False, index=False)

    sleep(randint(2, 10))
I'd like to crawl through Tesla's list of superchargers and open each individual page to record the number of connectors and charging rates. This is one of my first programs so I'm sure I'm doing a few things wrong, but I can't get past the HTTP Error 403 when I use urlopen to open multiple urls. Any help would be greatly appreciated!
from urllib.request import urlopen
from bs4 import BeautifulSoup
import ssl
import csv
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
url = 'https://www.tesla.com/findus/list/superchargers/United%20States'
html = urlopen(url, context=ctx).read()
soup_main = BeautifulSoup(html, "html.parser")
data = []
for tag in soup_main('a'):
    if '/findus/location/supercharger/' in tag.get('href', None):
        url_sc = 'https://www.tesla.com' + tag['href']
        html_sc = urlopen(url_sc, context=ctx).read()
        soup_sc = BeautifulSoup(html_sc, "html.parser")

        address = soup_sc.find('span', class_='street-address').string
        city = soup_sc.find('span', class_='locality').string[:-5]
        state = soup_sc.find('span', class_='locality').string[-3:]
        details = soup_sc.find_all('p')[1].contents[-1]
        data.append([address, city, state, details])

header = ['Address', 'City', 'State', 'Details']
with open('datapull.csv', 'w') as fp:
    writer = csv.writer(fp, delimiter=',')
    writer.writerow(header)
    for row in data:
        writer.writerow(row)
Try adding headers to fake a browser:
import urllib.request
# Request with Header Data to send User-Agent header
url_sc = 'https://www.journaldev.com'
headers = {}
headers['User-Agent'] = 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17'
request = urllib.request.Request(url_sc, headers=headers)
resp = urllib.request.urlopen(request)
source https://www.journaldev.com/20795/python-urllib-python-3-urllib#python-urllib-request-with-header
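Applied to the original supercharger script, this would look roughly like the sketch below; the fetch() helper is just an illustration, and the street-address / locality class names come from the question and may have changed since:

from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
import ssl

ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17'}

def fetch(page_url):
    # Wrap every urlopen() call in a Request that carries the User-Agent header
    return urlopen(Request(page_url, headers=headers), context=ctx).read()

soup_main = BeautifulSoup(fetch('https://www.tesla.com/findus/list/superchargers/United%20States'), 'html.parser')
for tag in soup_main('a'):
    href = tag.get('href', '')
    if '/findus/location/supercharger/' in href:
        soup_sc = BeautifulSoup(fetch('https://www.tesla.com' + href), 'html.parser')
        # ...parse address, city, state and details exactly as before...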
New to screen scraping here and this is my first time posting on Stack Overflow. Apologies in advance for any formatting errors in this post. Attempting to extract data from multiple pages with URL:
https://www.landwatch.com/Michigan_land_for_sale/West_Central_Region/Page-' + str(page)
For instance, page 1 is:
https://www.landwatch.com/Michigan_land_for_sale/West_Central_Region/Page-1
Page 2:
https://www.landwatch.com/Michigan_land_for_sale/West_Central_Region/Page-2
and so on...
My script runs without errors. However, the CSV exported by pandas contains only one row, with the first extracted value. At the time of this posting, the first value is:
14.01 Acres  Vestaburg, Montcalm County, MI$275,000
My intent is to create a spreadsheet with hundreds of rows that pull the property description from the URLs.
Here is my code:
import requests
from requests import get
from bs4 import BeautifulSoup
headers = ({'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'
            })

n_pages = 0
desc = []

for page in range(1, 900):
    n_pages += 1
    sapo_url = 'https://www.landwatch.com/Michigan_land_for_sale/West_Central_Region/Page-' + str(page)
    r = get(sapo_url, headers=headers)
    page_html = BeautifulSoup(r.text, 'html.parser')
    house_containers = page_html.find_all('div', class_="propName")
    if house_containers != []:
        for container in house_containers:
            desc = container.getText(strip=True)
    else:
        break

print('you scraped {} pages containing {} Properties'.format(n_pages, len(desc)))

import pandas as pd

df = pd.DataFrame({'description': [desc]})
df.to_csv('test4.csv', encoding='utf-8')
I suspect the problem is with the line desc = container.getText(strip=True). I have tried changing that line, but I keep getting errors when running.
Any help is appreciated.
I believe the mistake is in the line:
desc = container.getText(strip=True)
Every time it loops, the value in desc is replaced, not added on. To add items into the list, do:
desc.append(container.getText(strip=True))
Also, since it is already a list, you can remove the brackets from the DataFrame creation like so:
df = pd.DataFrame({'description': desc})
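Putting both changes together, a minimal sketch of the corrected loop (using a short page range just for illustration):

import pandas as pd
import requests
from bs4 import BeautifulSoup

desc = []
for page in range(1, 4):  # short range for illustration
    sapo_url = 'https://www.landwatch.com/Michigan_land_for_sale/West_Central_Region/Page-' + str(page)
    r = requests.get(sapo_url, headers={'User-Agent': 'Mozilla/5.0'})
    page_html = BeautifulSoup(r.text, 'html.parser')
    for container in page_html.find_all('div', class_="propName"):
        # append() adds every description instead of overwriting the variable
        desc.append(container.getText(strip=True))

# desc is already a list, so no extra brackets are needed
pd.DataFrame({'description': desc}).to_csv('test4.csv', encoding='utf-8')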
The cause is that data is not being accumulated inside the loop, so only the final value ends up being saved. For testing purposes the code below only goes up to page 2, so adjust the range as needed.
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
headers = ({'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'
            })

n_pages = 0
desc = []
all_data = pd.DataFrame(index=[], columns=['description'])

for page in range(1, 3):
    n_pages += 1
    sapo_url = 'https://www.landwatch.com/Michigan_land_for_sale/West_Central_Region/Page-' + str(page)
    r = get(sapo_url, headers=headers)
    page_html = BeautifulSoup(r.text, 'html.parser')
    house_containers = page_html.find_all('div', class_="propName")
    if house_containers != []:
        for container in house_containers:
            desc = container.getText(strip=True)
            df = pd.DataFrame({'description': [desc]})
            all_data = pd.concat([all_data, df], ignore_index=True)
    else:
        break

all_data.to_csv('test4.csv', encoding='utf-8')
# count the collected rows rather than the characters of the last description
print('you scraped {} pages containing {} Properties'.format(n_pages, len(all_data)))
A year ago I learned some Python in one of my classes, but I haven't had to use it much since then, so this may or may not be a simple question.
I'm trying to web-scrape the top grossing films of all time table from Box Office Mojo, and I want to grab the rank, title, and gross for the top 10 films in the 2010s. I've been playing around in Python and I can get the entire table into Python, but I don't know how to manipulate it from there, let alone write out a CSV file. Any guidance/tips?
Here is what will print the entire table for me (the first few lines are copied from an old web-scraping assignment to get me started):
import bs4
import requests
from bs4 import BeautifulSoup as soup
url = "https://www.boxofficemojo.com/chart/top_lifetime_gross/"
headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
page_html = requests.get(url, headers=headers)
page_soup = soup(page_html.text, "html.parser")
boxofficemojo_table = page_soup.find("div", {"class": "a-section imdb-scroll-table-inner"})
complete_table = boxofficemojo_table.get_text()
print(complete_table)
You can do this in two ways.
1. Using pd.read_html:
import pandas as pd
Data = pd.read_html(r'https://www.boxofficemojo.com/chart/top_lifetime_gross/')
for data in Data:
    data.to_csv('Data.csv', ',')
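Since the goal is the rank, title, and gross for the top 10 films from the 2010s, you can filter the DataFrame before saving it. A minimal sketch, assuming the table uses the column names Rank, Title, Lifetime Gross and Year (check df.columns first, since Box Office Mojo may change them):

import pandas as pd

# read_html returns a list of DataFrames; the lifetime-gross chart is the first one
df = pd.read_html('https://www.boxofficemojo.com/chart/top_lifetime_gross/')[0]

# Column names below are assumptions -- verify with print(df.columns)
df['Year'] = pd.to_numeric(df['Year'], errors='coerce')
top_2010s = df[df['Year'].between(2010, 2019)].head(10)
top_2010s[['Rank', 'Title', 'Lifetime Gross']].to_csv('top10_2010s.csv', index=False)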
2. Using bs4:
import pandas as pd
from bs4 import BeautifulSoup
import requests
URL = r'https://www.boxofficemojo.com/chart/top_lifetime_gross/'
print('\n>> Extracting Data using Beautiful Soup for: ' + URL)

try:
    res = requests.get(URL)
except Exception as e:
    print(repr(e))

print('\n<> URL present status Code = ', (res.status_code))

soup = BeautifulSoup(res.text, "lxml")
table = soup.find('table')

list_of_rows = []
for row in table.findAll('tr'):
    list_of_cells = []
    for cell in row.findAll(["td"]):
        text = cell.text
        list_of_cells.append(text)
    list_of_rows.append(list_of_cells)

for item in list_of_rows:
    ' '.join(item)

Data = pd.DataFrame(list_of_rows)
Data.dropna(axis=0, how='all', inplace=True)
print(Data.head(10))
Data.to_csv('Table.csv')
I am trying to scrape this website to get the reviews, but I am facing an issue:
The page loads only 50 reviews.
To load more, you have to click "Show More Reviews", and I don't know how to get the rest of the data since there is no page link; "Show More Reviews" also has no URL of its own, the address remains the same.
url = "https://www.capterra.com/p/134048/HiMama-Preschool-Child-Care-App/#reviews"

import requests
from bs4 import BeautifulSoup
import json
import pandas as pd

a = []
url = requests.get(url)
html = url.text
soup = BeautifulSoup(html, "html.parser")
table = soup.findAll("div", {"class": "review-comments"})
# print(table)

for x in table:
    a.append(x.text)

df = pd.DataFrame(a)
df.to_csv("review.csv", sep='\t')
I know this is not pretty code, but I am just trying to get the review text first.
Kindly help, as I am a little new to this.
Looking at the website, the "Show more reviews" button makes an ajax call that returns the additional reviews; all you have to do is find its link and send a GET request to it (which I've done with some simple regex):
import requests
import re
from bs4 import BeautifulSoup
headers = {
    "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) snap Chromium/74.0.3729.169 Chrome/74.0.3729.169 Safari/537.36"
}

url = "https://www.capterra.com/p/134048/HiMama-Preschool-Child-Care-App/#reviews"

Data = []

# Each page is equivalent to 50 comments:
MaximumCommentPages = 3

with requests.Session() as session:
    info = session.get(url)

    # Get product ID, needed for getting more comments
    productID = re.search(r'"product_id":(\w*)', info.text).group(1)

    # Extract info from main data
    soup = BeautifulSoup(info.content, "html.parser")
    table = soup.findAll("div", {"class": "review-comments"})
    for x in table:
        Data.append(x)

    # Number of pages to get:
    # Get additional data:
    params = {
        "page": "",
        "product_id": productID
    }
    while MaximumCommentPages > 1:  # number 1 because one of them was the main page data which we already extracted!
        MaximumCommentPages -= 1
        params["page"] = str(MaximumCommentPages)

        additionalInfo = session.get("https://www.capterra.com/gdm_reviews", params=params)
        print(additionalInfo.url)
        # print(additionalInfo.text)

        # Extract info from additional info:
        soup = BeautifulSoup(additionalInfo.content, "html.parser")
        table = soup.findAll("div", {"class": "review-comments"})
        for x in table:
            Data.append(x)

# Extract data the old fashioned way:
counter = 1
with open('review.csv', 'w') as f:
    for one in Data:
        f.write(str(counter))
        f.write(one.text)
        f.write('\n')
        counter += 1
Notice how I'm using a session to preserve cookies for the ajax call.
Edit 1: You can reload the webpage multiple times and call the ajax again to get even more data.
Edit 2: Save data using your own method.
Edit 3: Changed some things; the script now gets any number of pages for you and saves them to a file with good ol' open().