how to scrape page inside the result card using Bs4? - python

<img class="no-img" data-src="https://im1.dineout.co.in/images/uploads/restaurant/sharpen/4/h/u/p4059-15500352575c63a9394c209.jpg?tr=tr:n-medium" alt="Biryani By Kilo" data-gatype="RestaurantImageClick" data-url="/delhi/biryani-by-kilo-connaught-place-central-delhi-40178" data-w-onclick="cardClickHandler" src="https://im1.dineout.co.in/images/uploads/restaurant/sharpen/4/h/u/p4059-15500352575c63a9394c209.jpg?tr=tr:n-medium">
page url - https://www.dineout.co.in/delhi-restaurants?search_str=biryani&p=1
this page contains some restaurants card now while scrapping the page in the loop I want to go inside the restaurant card URL which is in the above HTML code name by data-url class and scrape the no. of reviews from inside it, I don't know how to do it my current code for normal front page scrapping is ;
def extract(page):
url = f"https://www.dineout.co.in/delhi-restaurants?search_str=biryani&p={page}" # URL of the website
header = {'User-Agent':'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36'} # Temporary user agent
r = requests.get(url, headers=header)
soup = BeautifulSoup(r.content, 'html.parser')
return soup
def transform(soup): # function to scrape the page
divs = soup.find_all('div', class_ = 'restnt-card restaurant')
for item in divs:
title = item.find('a').text.strip() # restaurant name
loc = item.find('div', class_ = 'restnt-loc ellipsis').text.strip() # restaurant location
try: # used this try and except method because some restaurants are unrated and while scrpaping those we would run into an error
rating = item.find('div', class_="img-wrap").text
rating = (re.sub("[^0-9,.]", "", rating))
except:
rating = None
pricce = item.find('span', class_="double-line-ellipsis").text.strip() # price for biriyani
price = re.sub("[^0-9]", "", pricce)[:-1]
biry_del = {
'name': title,
'location': loc,
'rating': rating,
'price': price
}
rest_list.append(biry_del)
rest_list = []
for i in range(1,18):
print(f'getting page, {i}')
c = extract(i)
transform(c)
I hope you guys understood please ask in comment for any confusion.

It's not very fast but it looks like you can get all the details you want including the review count (not 232!) if you hit this backend api endpoint:
https://www.dineout.co.in/get_rdp_data_main/delhi/69676/restaurant_detail_main
import requests
from bs4 import BeautifulSoup
import pandas as pd
rest_list = []
for page in range(1,3):
print(f'getting page, {page}')
s = requests.Session()
url = f"https://www.dineout.co.in/delhi-restaurants?search_str=biryani&p={page}" # URL of the website
header = {'User-Agent':'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36'} # Temporary user agent
r = s.get(url, headers=header)
soup = BeautifulSoup(r.content, 'html.parser')
divs = soup.find_all('div', class_ = 'restnt-card restaurant')
for item in divs:
code = item.find('a')['href'].split('-')[-1] # restaurant code
print(f'Getting details for {code}')
data = s.get(f'https://www.dineout.co.in/get_rdp_data_main/delhi/{code}/restaurant_detail_main').json()
info = data['header']
info.pop('share') #clean up csv
info.pop('options')
rest_list.append(info)
df = pd.DataFrame(rest_list)
df.to_csv('dehli_rest.csv',index=False)

Related

How do I capture the URL of each job so I can open full job description when looking at csv file

Can someone help me modify this script so that it also scraps the URL associated with each job. The purpose would be when browsing the .csv file in a spreadsheet I can click on the link if I would like to know more information about the job. Thank you in advance.
import requests
from bs4 import BeautifulSoup
import pandas as pd
def extract(page):
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'}
url= f'https://www.indeed.com/jobs?q=Dispensary&l=Denver%2C+CO&radius={page}'
r = requests.get(url, headers)
soup = BeautifulSoup(r.content, 'html.parser')
return soup
def transform(soup):
divs = soup.find_all('div', class_ = 'jobsearch-SerpJobCard')
for item in divs:
title = item.find('a').text.strip()
company = item.find('span', class_ = 'company').text.strip()
try:
salary = item.find('span', class_ = 'salaryText').text.strip()
except:
salary = ''
summary = item.find('div', class_ = 'summary').text.strip().replace('\n', '')
job = {
'title': title,
'company': company,
'salary': salary,
'summary': summary
}
joblist.append(job)
return
joblist = []
for i in range(0,90,10):
print(f'Getting page, {i}')
c = extract(0)
transform(c)
df = pd.DataFrame(joblist)
print(df.head())
df.to_csv('jobs.csv')
You can use one of this
url = 'https://www.indeed.com' + item.find('a')['href']
url = 'https://www.indeed.com' + item.find('a').get('href')
url = 'https://www.indeed.com' + item.find('a').attrs['href']
url = 'https://www.indeed.com' + item.find('a').attrs.get('href')
BTW:
You always load the same page. To get next page you have to use start=... in url.
And you can do this more readable using dictionary and params= in requests
payload = {
'q': 'Dispensary',
'l': 'Denver,+CO',
'radius': 0,
'start': page,
}
url= 'https://www.indeed.com/jobs'
r = requests.get(url, params=payload, headers=headers)
Working code:
import requests
from bs4 import BeautifulSoup
import pandas as pd
def extract(start):
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'
}
payload = {
'q': 'Dispensary',
'l': 'Denver,+CO',
'radius': 0,
'start': start,
}
url= 'https://www.indeed.com/jobs'
r = requests.get(url, params=payload, headers=headers)
soup = BeautifulSoup(r.content, 'html.parser')
return soup
def transform(soup, joblist):
divs = soup.find_all('div', class_ = 'jobsearch-SerpJobCard')
for item in divs:
title = item.find('a').text.strip()
url = 'https://www.indeed.com' + item.find('a')['href']
#url = 'https://www.indeed.com' + item.find('a').get('href')
#url = 'https://www.indeed.com' + item.find('a').attrs['href']
#url = 'https://www.indeed.com' + item.find('a').attrs.get('href')
company = item.find('span', class_ = 'company').text.strip()
try:
salary = item.find('span', class_ = 'salaryText').text.strip()
except:
salary = ''
summary = item.find('div', class_ = 'summary').text.strip().replace('\n', '')
joblist.append({
'title': title,
'url': url,
'company': company,
'salary': salary,
'summary': summary
})
# --- main ---
joblist = []
for start in range(0, 90, 10):
print('Getting page', start)
c = extract(start)
transform(c, joblist)
df = pd.DataFrame(joblist)
df.to_csv('jobs.csv')
print(df.head())

Unable to scrape the name from the inner page of each result using requests

I've created a script in python making use of post http requests to get the search results from a webpage. To populate the results, it is necessary to click on the fields sequentially shown here. Now a new page will be there and this is how to populate the result.
There are ten results in the first page and the following script can parse the results flawlessly.
What I wish to do now is use the results to reach their inner page in order to parse Sole Proprietorship Name (English) from there.
website address
I've tried so far with:
import re
import requests
from bs4 import BeautifulSoup
url = "https://www.businessregistration.moc.gov.kh/cambodia-master/service/create.html?targetAppCode=cambodia-master&targetRegisterAppCode=cambodia-br-soleproprietorships&service=registerItemSearch"
payload = {
'QueryString': '0',
'SourceAppCode': 'cambodia-br-soleproprietorships',
'OriginalVersionIdentifier': '',
'_CBASYNCUPDATE_': 'true',
'_CBHTMLFRAG_': 'true',
'_CBNAME_': 'buttonPush'
}
with requests.Session() as s:
s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0'
res = s.get(url)
target_url = res.url.split("&")[0].replace("view.", "update.")
node = re.findall(r"nodeW\d.+?-Advanced",res.text)[0].strip()
payload['_VIKEY_'] = re.findall(r"viewInstanceKey:'(.*?)',", res.text)[0].strip()
payload['_CBHTMLFRAGID_'] = re.findall(r"guid:(.*?),", res.text)[0].strip()
payload[node] = 'N'
payload['_CBNODE_'] = re.findall(r"Callback\('(.*?)','buttonPush", res.text)[2]
payload['_CBHTMLFRAGNODEID_'] = re.findall(r"AsyncWrapper(W\d.+?)'",res.text)[0].strip()
res = s.post(target_url,data=payload)
soup = BeautifulSoup(res.content, 'html.parser')
for item in soup.find_all("span", class_="appReceiveFocus")[3:]:
print(item.text)
How can I parse the Name (English) from each of the results inner page using requests?
This is one of the ways you can parse the name from the site's inner page and then email address from the address tab. I added this function .get_email() only because I wanted to let you know as to how you can parse content from different tabs.
import re
import requests
from bs4 import BeautifulSoup
url = "https://www.businessregistration.moc.gov.kh/cambodia-master/service/create.html?targetAppCode=cambodia-master&targetRegisterAppCode=cambodia-br-soleproprietorships&service=registerItemSearch"
result_url = "https://www.businessregistration.moc.gov.kh/cambodia-master/viewInstance/update.html?id={}"
base_url = "https://www.businessregistration.moc.gov.kh/cambodia-br-soleproprietorships/viewInstance/update.html?id={}"
def get_names(s):
s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0'
res = s.get(url)
target_url = result_url.format(res.url.split("id=")[1])
soup = BeautifulSoup(res.text,"lxml")
payload = {i['name']:i.get('value','') for i in soup.select('input[name]')}
payload['QueryString'] = 'a'
payload['SourceAppCode'] = 'cambodia-br-soleproprietorships'
payload['_CBNAME_'] = 'buttonPush'
payload['_CBHTMLFRAG_'] = 'true'
payload['_VIKEY_'] = re.findall(r"viewInstanceKey:'(.*?)',", res.text)[0].strip()
payload['_CBHTMLFRAGID_'] = re.findall(r"guid:(.*?),", res.text)[0].strip()
payload['_CBNODE_'] = re.findall(r"Callback\('(.*?)','buttonPush", res.text)[-1]
payload['_CBHTMLFRAGNODEID_'] = re.findall(r"AsyncWrapper(W\d.+?)'",res.text)[0].strip()
res = s.post(target_url,data=payload)
soup = BeautifulSoup(res.text,"lxml")
payload.pop('_CBHTMLFRAGNODEID_')
payload.pop('_CBHTMLFRAG_')
payload.pop('_CBHTMLFRAGID_')
for item in soup.select("a[class*='ItemBox-resultLeft-viewMenu']"):
payload['_CBNAME_'] = 'invokeMenuCb'
payload['_CBVALUE_'] = ''
payload['_CBNODE_'] = item['id'].replace('node','')
res = s.post(target_url,data=payload)
soup = BeautifulSoup(res.text,'lxml')
address_url = base_url.format(res.url.split("id=")[1])
node_id = re.findall(r"taba(.*)_",soup.select_one("a[aria-label='Addresses']")['id'])[0]
payload['_CBNODE_'] = node_id
payload['_CBHTMLFRAGID_'] = re.findall(r"guid:(.*?),", res.text)[0].strip()
payload['_CBNAME_'] = 'tabSelect'
payload['_CBVALUE_'] = '1'
eng_name = soup.select_one(".appCompanyName + .appAttrValue").get_text()
yield from get_email(s,eng_name,address_url,payload)
def get_email(s,eng_name,url,payload):
res = s.post(url,data=payload)
soup = BeautifulSoup(res.text,'lxml')
email = soup.select_one(".EntityEmailAddresses:contains('Email') .appAttrValue").get_text()
yield eng_name,email
if __name__ == '__main__':
with requests.Session() as s:
for item in get_names(s):
print(item)
Output are like:
('AMY GEMS', 'amy.n.company#gmail.com')
('AHARATHAN LIN LIANJIN FOOD FLAVOR', 'skykoko344#gmail.com')
('AMETHYST DIAMOND KTV', 'twobrotherktv#gmail.com')
To get the Name (English) you can simply replace print(item.text) with print(item.text.split('/')[1].split('(')[0].strip()) which prints AMY GEMS

Scrape table fields from html with specific class

So I want to build a simple scraper for google shopping and I encountered some problems.
This is the html text from my request(to https://www.google.es/shopping/product/7541391777504770249/online) where I'm trying to query the highlighted div class sh-osd__total-price inside the div class sh-osd__offer-row :
My code is currently:
from bs4 import BeautifulSoup
from requests import get
url = 'https://www.google.es/shopping/product/7541391777504770249/online'
response = get(url)
html_soup = BeautifulSoup(response.text, 'html.parser')
r = html_soup.findAll('tr', {'class': 'sh-osd__offer-row'}) #Returns empty
print(r)
r = html_soup.findAll('tr', {'class': 'sh-osd__total-price'}) #Returns empty
print(r)
Where both r are empty, beatiful soup doesn't find anything.
Is there any way to find these two div classes with beautiful soup?
You need to add user agent into the headers:
from bs4 import BeautifulSoup
from requests import get
url = 'https://www.google.es/shopping/product/7541391777504770249/online'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'} #<-- added line
response = get(url, headers=headers) #<--- include here
html_soup = BeautifulSoup(response.text, 'html.parser')
r = html_soup.find_all('tr', {'class': 'sh-osd__offer-row'}) #Returns empty
print(r)
r = html_soup.findAll('tr', {'class': 'sh-osd__total-price'}) #Returns empty
print(r)
But, since it's a <table> tag, you can use pandas (it uses beautifulsoup under the hood), but does the hard work for you. It will return a list of all elements that are <table>s as dataframes
import pandas as pd
url = 'https://www.google.es/shopping/product/7541391777504770249/online'
dfs = pd.read_html(url)
print(dfs[-1])
Output:
print(dfs[-1])
Sellers Seller Rating ... Base Price Total Price
0 One Fragance No rating ... £30.95 +£8.76 delivery £39.71
1 eBay No rating ... £46.81 £46.81
2 Carethy.co.uk No rating ... £34.46 +£3.99 delivery £38.45
3 fruugo.co.uk No rating ... £36.95 +£9.30 delivery £46.25
4 cosmeticsmegastore.com/gb No rating ... £36.95 +£9.30 delivery £46.25
5 Perfumes Club UK No rating ... £30.39 +£5.99 delivery £36.38
[6 rows x 5 columns]

Scraping through every product on retailer website

We are trying to scrape every product for every category on Forever 21's website. Given a product page, we know how to extract the information we need, and given a category, we can extract every product. However, we do not know how to crawl through every product category. Here is our code for a given category and getting every product:
import requests
from bs4 import BeautifulSoup
import json
#import re
params = {"action": "getcategory",
"br": "f21",
#"category": re.compile('\S+'),
"category": "dress",
"pageno": 1,
"pagesize": "",
"sort": "",
"fsize": "",
"fcolor": "",
"fprice": "",
"fattr": ""}
url = "http://www.forever21.com/Ajax/Ajax_Category.aspx"
js = requests.get(url, params=params).json()
soup = BeautifulSoup(js[u'CategoryHTML'], "html.parser")
i = 0
j = 0
while len(soup.select("div.item_pic a")) != 0:
for a in soup.select("div.item_pic a"):
#print a["href"]
i = i + 1
params["pageno"] = params["pageno"] + 1
j = j + 1
js = requests.get(url, params=params).json()
soup = BeautifulSoup(js[u'CategoryHTML'], "html.parser")
print i
print j
As you can see in the comments, we tried to use regular expressions for the category but had no success. i and j are just product and page counters. Any suggestions on how to modify/add to this code to get every product category?
You can scrape the category page and get all subcategories from the navigation menu:
import requests
from bs4 import BeautifulSoup
url = "http://www.forever21.com/Product/Category.aspx?br=f21&category=app-main"
response = requests.get(url, headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36"})
soup = BeautifulSoup(response.content, "html.parser")
menues = [li["class"][0] for li in soup.select("#has_sub .white nav ul > li")]
print(menues)
Prints:
[u'women-new-arrivals', u'want_list', u'dress', u'top_blouses', u'outerwear_coats-and-jackets', u'bottoms', u'intimates_loungewear', u'activewear', u'swimwear_all', u'acc', u'shoes', u'branded-shop-women-clothing', u'sale_women|women', u'women-new-arrivals-clothing-dresses', u'women-new-arrivals-clothing-tops', u'women-new-arrivals-clothing-outerwear', u'women-new-arrivals-clothing-bottoms', u'women-new-arrivals-clothing-intimates-loungewear', u'women-new-arrivals-clothing-swimwear', u'women-new-arrivals-clothing-activewear', u'women-new-arrivals-accessories|women-new-arrivals', u'women-new-arrivals-shoes|women-new-arrivals', u'promo-web-exclusives', u'promo-best-sellers-app', u'backinstock-women', u'promo-shop-by-outfit-women', u'occasion-shop-wedding', u'contemporary-main', u'promo-basics', u'21_items', u'promo-summer-forever', u'promo-coming-soon', u'dress_casual', u'dress_romper', u'dress_maxi', u'dress_midi', u'dress_mini', u'occasion-shop-dress', u'top_blouses-off-shoulder', u'top_blouses-lace-up', u'top_bodysuits-bustiers', u'top_graphic-tops', u'top_blouses-crop-top', u'top_t-shirts', u'sweater', u'top_blouses-sweatshirts-hoodies', u'top_blouses-shirts', u'top_plaids', u'outerwear_bomber-jackets', u'outerwear_blazers', u'outerwear_leather-suede', u'outerwear_jean-jackets', u'outerwear_lightweight', u'outerwear_utility-jackets', u'outerwear_trench-coats', u'outerwear_faux-fur', u'promo-jeans-refresh|bottoms', u'bottoms_pants', u'bottoms_skirt', u'bottoms_shorts', u'bottoms_shorts-active', u'bottoms_leggings', u'bottoms_sweatpants', u'bottom_jeans|', u'intimates_loungewear-bras', u'intimates_loungewear-panties', u'intimates_loungewear-bodysuits-slips', u'intimates_loungewear-seamless', u'intimates_loungewear-accessories', u'intimates_loungewear-sets', u'activewear_top', u'activewear_sports-bra', u'activewear_bottoms', u'activewear_accessories', u'swimwear_tops', u'swimwear_bottoms', u'swimwear_one-piece', u'swimwear_cover-ups', u'acc_features', u'acc_jewelry', u'acc_handbags', u'acc_glasses', u'acc_hat', u'acc_hair', u'acc_legwear', u'acc_scarf-gloves', u'acc_home-and-gift-items', u'shoes_features', u'shoes_boots', u'shoes_high-heels', u'shoes_sandalsflipflops', u'shoes_wedges', u'shoes_flats', u'shoes_oxfords-loafers', u'shoes_sneakers', u'Shoes_slippers', u'branded-shop-new-arrivals-women', u'branded-shop-women-clothing-dresses', u'branded-shop-women-clothing-tops', u'branded-shop-women-clothing-outerwear', u'branded-shop-women-clothing-bottoms', u'branded-shop-women-clothing-intimates', u'branded-shop-women-accessories|branded-shop-women-clothing', u'branded-shop-women-accessories-jewelry|', u'branded-shop-shoes-women|branded-shop-women-clothing', u'branded-shop-sale-women', u'/brandedshop/brandlist.aspx', u'promo-branded-boho-me', u'promo-branded-rare-london', u'promo-branded-selfie-leslie', u'sale-newly-added', u'sale_dresses', u'sale_tops', u'sale_outerwear', u'sale_sweaters', u'sale_bottoms', u'sale_intimates', u'sale_swimwear', u'sale_activewear', u'sale_acc', u'sale_shoes', u'the-outlet', u'sale-under-5', u'sale-under-10', u'sale-under-15']
Note the values of br and category GET parameters. f21 is the "Women" category, app-main is the main page for a category.

Beautifulsoup parsing error

I am trying to extract some information about an App on Google Play and BeautifulSoup doesn't seem to work.
The link is this(say):
https://play.google.com/store/apps/details?id=com.cimaxapp.weirdfacts
My code:
url = "https://play.google.com/store/apps/details?id=com.cimaxapp.weirdfacts"
r = requests.get(url)
html = r.content
soup = BeautifulSoup(html)
l = soup.find_all("div", { "class" : "document-subtitles"})
print len(l)
0 #How is this 0?! There is clearly a div with that class
I decided to go all in, didn't work either:
i = soup.select('html body.no-focus-outline.sidebar-visible.user-has-no-subscription div#wrapper.wrapper.wrapper-with-footer div#body-content.body-content div.outer-container div.inner-container div.main-content div div.details-wrapper.apps.square-cover.id-track-partial-impression.id-deep-link-item div.details-info div.info-container div.info-box-top')
print i
What am I doing wrong?
You need to pretend to be a real browser by supplying the User-Agent header:
import requests
from bs4 import BeautifulSoup
url = "https://play.google.com/store/apps/details?id=com.cimaxapp.weirdfacts"
r = requests.get(url, headers={
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36"
})
html = r.content
soup = BeautifulSoup(html, "html.parser")
title = soup.find(class_="id-app-title").get_text()
rating = soup.select_one(".document-subtitle .star-rating-non-editable-container")["aria-label"].strip()
print(title)
print(rating)
Prints the title and the current rating:
Weird Facts
Rated 4.3 stars out of five stars
To get the additional information field values, you can use the following generic function:
def get_info(soup, text):
return soup.find("div", class_="title", text=lambda t: t and t.strip() == text).\
find_next_sibling("div", class_="content").get_text(strip=True)
Then, if you do:
print(get_info(soup, "Size"))
print(get_info(soup, "Developer"))
You will see printed:
1.4M
Email email#here.com

Categories