I am having some trouble with web scraping using beautifulsoup - python

when ever i try to extract text between tags using .text() it gives a blank screen with just [] as output
import requests
from bs4 import BeautifulSoup
page = requests.get("https://www.amazon.in/s?k=ssd&ref=nb_sb_noss")
soup = BeautifulSoup(page.content, "html.parser")
product = soup.find_all("h2",class_="a-link-normal a-text-normal")
results = soup.find_all("span",class_="a-offscreen")
print(product)
this is the output that i got :
C:\Users\Kushal\Desktop\requests-tutorial>C:/Users/Kushal/AppData/Local/Programs/Python/Python37/python.exe c:/Users/Kushal/Desktop/requests-tutorial/scraper.py
[]
when i try listing everything with a for loop then, nothing shows up not even the empty square brackets

Based on your comment below. I've modified the code to fetch all the product title on the said page along with the price details.
Mark as answer if it works, else comment for further analysis.
import requests
from bs4 import BeautifulSoup
import lxml
dataList = list()
headers = {
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5)",
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"accept-charset": "cp1254,ISO-8859-9,utf-8;q=0.7,*;q=0.3",
"accept-encoding": "gzip,deflate,sdch",
"accept-language": "tr,tr-TR,en-US,en;q=0.8",
}
url = requests.get('https://www.amazon.in/s?k=ssd&ref=nb_sb_noss'.format(), headers=headers)
soup = BeautifulSoup(url.content, 'lxml')
title = soup.find_all('span', attrs={'class':'a-size-medium a-color-base a-text-normal'})
price = soup.find_all('span', attrs={'class':'a-offscreen'})
for product in zip(title,price):
title,price=product
title_proper=title.text.strip()
price_proper=price.text.strip()
print(title_proper,'-',price_proper)

Related

The result of parsing petshop.ru is not displayed

I'm trying to parse the page https://www.petshop.ru/catalog/cats/veterinary_feed/dlya_koshek_pri_zapore_fibre_response_fr31_5789/, but it doesn't work.
import requests
from bs4 import BeautifulSoup as BS
r = requests.get ("https://www.petshop.ru/catalog/cats/veterinary_feed/dlya_koshek_pri_zapore_fibre_response_fr31_5789/")
html = BS (r.content, 'html.parser')
for el in html.select (".style_product_head__ufClP > .style_tablet__bK5he style_desktop__3Zkvu"):
title = el.select ('.style_list__3V0_P > .style_price_wrapper__1HT8P')
print ( title[0].text )
I do according to the model, because unfamiliar with python:
import requests
from bs4 import BeautifulSoup as BS
r = requests.get ("https://stopgame.ru/review/new/izumitelno/p1")
html = BS (r.content, 'html.parser')
for el in html.select (".items > .article-summary "):
title = el.select ('.caption > a')
print ( title[0].text )
I expect to see the following result: Обычная цена
Ideally, it would also be interesting to know how to display a result of this kind: petshop.ru: Обычная цена 3 125 ₽ , because I plan to implement the parsing of several more sites to track prices for this cat food :)
All the data you want is in the HTML source and it comes as a JSON object in a <script> tag. You can easily target that and parse it.
Here's how:
import json
import re
import requests
url = "https://www.petshop.ru/catalog/cats/veterinary_feed/dlya_koshek_pri_zapore_fibre_response_fr31_5789/"
headers = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:99.0) Gecko/20100101 Firefox/99.0",
}
data = (
json.loads(
re.search(
r'"onProductView",\s(.*)\);',
requests.get(url, headers=headers).text
).group(1)
)
)
print(f"{data['product']['name']} - {data['product']['price']} руб.")
Output:
Для кошек при запоре - 3125 руб.

Finding a Field in an parsed HTML content

I was trying to get the table from https://www.innovatoretfs.com/etf/default.aspx?ticker=kjul
I just want the top-right table
I wrote the code in python and was able to get till the parsed HTML content and got the contents.
import requests
from bs4 import BeautifulSoup
url = "https://www.innovatoretfs.com/etf/default.aspx?ticker=kjul"
r = requests.get(url, headers = headers)
soup = BeautifulSoup(r.content, "html.parser")
table_it=soup.find_all(class_="al_fund")
value_it=soup.find_all(class_="ar_fund")
print(table_it)
print(value_it)
But I am not able to get the final values in a table coz of the HTML formal. can some one please help?
Another approach would be to grab all the values for given divs and then zip them all together.
Here's how:
import requests
from bs4 import BeautifulSoup
def extractor(s: BeautifulSoup, tag: str, cls: str) -> list:
return [i.getText() for i in s.find_all(tag, class_=cls)]
url = "https://www.innovatoretfs.com/etf/default.aspx?ticker=kjul"
soup = BeautifulSoup(requests.get(url).content, "html.parser")
row_names = extractor(soup, "div", "al_fund")
row_values = extractor(soup, "div", "ar_fund")
for k, v in list(zip(row_names, row_values))[:6]:
print(f"{k}\t{v}")
Output:
Ticker KJUL
Listing Date 7/1/2020
Number of Holdings 7
Expense Ratio 0.79%
Intraday NAV KJUL.IV
Exchange Cboe BZX
Quick and easy way to get the desired content is like the following:
import requests
from bs4 import BeautifulSoup
link = 'https://www.innovatoretfs.com/etf/default.aspx?ticker=kjul'
with requests.Session() as s:
s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'
r = s.get(link)
soup = BeautifulSoup(r.text,"lxml")
for item in soup.select(".noshadecol")[:6]:
title = item.select_one(".al_fund").text
value = item.select_one(".ar_fund").text
print(title,value)
Output:
Ticker KJUL
Listing Date 7/1/2020
Number of Holdings 7
Expense Ratio 0.79%
Intraday NAV KJUL.IV
Exchange Cboe BZX
with respect to the structure of HTML you did right to extract al_fund and ar_fund. You just need to add this code to print the table :
for i in range(6):
print(f'{table_it[i].text} : {value_it[i].text}')
Ticker : KJUL
Listing Date : 7/1/2020
Number of Holdings : 7
Expense Ratio : 0.79%
Intraday NAV : KJUL.IV
Exchange : Cboe BZX

returning broken href link when scraping with python

I am trying to scrape IMDB https://www.imdb.com/chart/top/?ref_=nv_mv_250. I want to write a loop to enter each film page by getting all the href attributes. However, the html code returned by urlopen shows broken href attributes (ignoring everythin after the question mark). Here are my code and result. Thank you so much in advance.
from bs4 import BeautifulSoup
from urllib.request import urlopen
url = 'https://www.imdb.com/chart/top/?ref_=nv_mv_250'
html = urlopen(url)
bs = BeautifulSoup(html.read(),'html.parser')
table = bs.find('tbody',{'class':'lister-list'})
rows = table.find_all('tr')
for row in rows:
link = row.find('td',{'class':'titleColumn'}).find('a')['href']
print(link)
The result I get is something like this (ignoring everythin after the question mark)
/usr/local/Caskroom/miniconda/base/envs/web_scraping/bin/python /Users/gracezhou/Python/Python-Projects/scraping/imdb/test.py
/title/tt0111161/
/title/tt0068646/
/title/tt0071562/
/title/tt0468569/
/title/tt0050083/
/title/tt0108052/
/title/tt0167260/
/title/tt0110912/
/title/tt0060196/
/title/tt0120737/
/title/tt0137523/
/title/tt0109830/
I wan to receive somethins like this:
/title/tt0111161/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=CPK54FS6SPX9EDAPBSJT&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_1
Send the request using headers with User-Agent in it.
import requests
from bs4 import BeautifulSoup
headers = {'User-Agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36'}
res = requests.get("https://www.imdb.com/chart/top/?ref_=nv_mv_250", headers=headers)
soup = BeautifulSoup(res.content,"html.parser")
table = soup.find('tbody',{'class':'lister-list'})
rows = table.find_all('tr')
for row in rows:
link = row.find('td',{'class':'titleColumn'}).find('a')['href']
print(link)
Output:
/title/tt0111161/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_1
/title/tt0068646/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_2
/title/tt0071562/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_3
/title/tt0468569/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_4
/title/tt0050083/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_5
/title/tt0108052/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_6
/title/tt0167260/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_7
/title/tt0110912/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_8
/title/tt0060196/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_9
/title/tt0120737/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_10
/title/tt0137523/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_11
/title/tt0109830/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_12
/title/tt1375666/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_13
/title/tt0080684/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_14
/title/tt0167261/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_15
/title/tt0133093/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_16
/title/tt0099685/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_17
/title/tt0073486/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_18
/title/tt0047478/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_19
/title/tt0114369/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_20
/title/tt0118799/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_21
/title/tt0317248/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_22
/title/tt0102926/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_23
/title/tt8503618/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_24
/title/tt0038650/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_25
/title/tt0076759/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_26
/title/tt0120815/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_27
/title/tt0245429/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_28
/title/tt6751668/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_29
/title/tt0120689/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_30
/title/tt0816692/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_31
/title/tt0110413/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_32
/title/tt0114814/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_33
/title/tt0056058/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_34
/title/tt0110357/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_35
/title/tt0088763/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_36
/title/tt0253474/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_37
/title/tt0103064/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_38
/title/tt0120586/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_39
/title/tt0027977/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_40
/title/tt0054215/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_41
/title/tt0172495/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_42
/title/tt0021749/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_43
/title/tt0407887/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_44
/title/tt1675434/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_45
/title/tt2582802/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_46
/title/tt0482571/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_47
/title/tt0095327/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_48
/title/tt0064116/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_49
/title/tt0034583/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_50
/title/tt0095765/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_51
/title/tt0047396/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_52
/title/tt0078748/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_53
/title/tt0078788/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_54
/title/tt0209144/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_55
/title/tt0082971/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_56
/title/tt0032553/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_57
/title/tt7286456/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_58
/title/tt0405094/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_59
/title/tt1853728/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_60
/title/tt0050825/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_61
/title/tt0081505/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_62
/title/tt0910970/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_63
/title/tt4154756/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_64
/title/tt0043014/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_65
/title/tt4633694/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_66
/title/tt0119698/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_67
/title/tt0057012/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_68
/title/tt0051201/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_69
/title/tt0364569/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_70
/title/tt1345836/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_71
/title/tt0087843/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_72
/title/tt4154796/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_73
/title/tt0090605/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_74
/title/tt5311514/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_75
/title/tt2380307/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_76
/title/tt0169547/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_77
/title/tt0112573/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_78
/title/tt1187043/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_79
/title/tt0082096/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_80
/title/tt0114709/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_81
/title/tt0057565/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_82
/title/tt0086879/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_83
/title/tt0986264/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_84
/title/tt0086190/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_85
/title/tt0361748/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_86
/title/tt0105236/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_87
/title/tt0119217/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_88
/title/tt8267604/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_89
/title/tt0062622/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_90
/title/tt0180093/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_91
/title/tt8579674/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_92
/title/tt0052357/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_93
/title/tt0022100/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_94
/title/tt5074352/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_95
/title/tt0338013/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_96
/title/tt2106476/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_97
/title/tt0033467/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_98
/title/tt0093058/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_99
/title/tt0040522/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_100
/title/tt0066921/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_101
/title/tt0053125/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_102
/title/tt0012349/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_103
/title/tt0208092/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_104
/title/tt0045152/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_105
/title/tt0086250/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_106
/title/tt0075314/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_107
/title/tt0211915/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_108
/title/tt0056172/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=e31d89dd-322d-4646-8962-327b42fe94b1&pf_rd_r=YMF3CFY4HGXT0TT942CP&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_tt_109
...
...
...

Scraping through every product on retailer website

We are trying to scrape every product for every category on Forever 21's website. Given a product page, we know how to extract the information we need, and given a category, we can extract every product. However, we do not know how to crawl through every product category. Here is our code for a given category and getting every product:
import requests
from bs4 import BeautifulSoup
import json
#import re
params = {"action": "getcategory",
"br": "f21",
#"category": re.compile('\S+'),
"category": "dress",
"pageno": 1,
"pagesize": "",
"sort": "",
"fsize": "",
"fcolor": "",
"fprice": "",
"fattr": ""}
url = "http://www.forever21.com/Ajax/Ajax_Category.aspx"
js = requests.get(url, params=params).json()
soup = BeautifulSoup(js[u'CategoryHTML'], "html.parser")
i = 0
j = 0
while len(soup.select("div.item_pic a")) != 0:
for a in soup.select("div.item_pic a"):
#print a["href"]
i = i + 1
params["pageno"] = params["pageno"] + 1
j = j + 1
js = requests.get(url, params=params).json()
soup = BeautifulSoup(js[u'CategoryHTML'], "html.parser")
print i
print j
As you can see in the comments, we tried to use regular expressions for the category but had no success. i and j are just product and page counters. Any suggestions on how to modify/add to this code to get every product category?
You can scrape the category page and get all subcategories from the navigation menu:
import requests
from bs4 import BeautifulSoup
url = "http://www.forever21.com/Product/Category.aspx?br=f21&category=app-main"
response = requests.get(url, headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36"})
soup = BeautifulSoup(response.content, "html.parser")
menues = [li["class"][0] for li in soup.select("#has_sub .white nav ul > li")]
print(menues)
Prints:
[u'women-new-arrivals', u'want_list', u'dress', u'top_blouses', u'outerwear_coats-and-jackets', u'bottoms', u'intimates_loungewear', u'activewear', u'swimwear_all', u'acc', u'shoes', u'branded-shop-women-clothing', u'sale_women|women', u'women-new-arrivals-clothing-dresses', u'women-new-arrivals-clothing-tops', u'women-new-arrivals-clothing-outerwear', u'women-new-arrivals-clothing-bottoms', u'women-new-arrivals-clothing-intimates-loungewear', u'women-new-arrivals-clothing-swimwear', u'women-new-arrivals-clothing-activewear', u'women-new-arrivals-accessories|women-new-arrivals', u'women-new-arrivals-shoes|women-new-arrivals', u'promo-web-exclusives', u'promo-best-sellers-app', u'backinstock-women', u'promo-shop-by-outfit-women', u'occasion-shop-wedding', u'contemporary-main', u'promo-basics', u'21_items', u'promo-summer-forever', u'promo-coming-soon', u'dress_casual', u'dress_romper', u'dress_maxi', u'dress_midi', u'dress_mini', u'occasion-shop-dress', u'top_blouses-off-shoulder', u'top_blouses-lace-up', u'top_bodysuits-bustiers', u'top_graphic-tops', u'top_blouses-crop-top', u'top_t-shirts', u'sweater', u'top_blouses-sweatshirts-hoodies', u'top_blouses-shirts', u'top_plaids', u'outerwear_bomber-jackets', u'outerwear_blazers', u'outerwear_leather-suede', u'outerwear_jean-jackets', u'outerwear_lightweight', u'outerwear_utility-jackets', u'outerwear_trench-coats', u'outerwear_faux-fur', u'promo-jeans-refresh|bottoms', u'bottoms_pants', u'bottoms_skirt', u'bottoms_shorts', u'bottoms_shorts-active', u'bottoms_leggings', u'bottoms_sweatpants', u'bottom_jeans|', u'intimates_loungewear-bras', u'intimates_loungewear-panties', u'intimates_loungewear-bodysuits-slips', u'intimates_loungewear-seamless', u'intimates_loungewear-accessories', u'intimates_loungewear-sets', u'activewear_top', u'activewear_sports-bra', u'activewear_bottoms', u'activewear_accessories', u'swimwear_tops', u'swimwear_bottoms', u'swimwear_one-piece', u'swimwear_cover-ups', u'acc_features', u'acc_jewelry', u'acc_handbags', u'acc_glasses', u'acc_hat', u'acc_hair', u'acc_legwear', u'acc_scarf-gloves', u'acc_home-and-gift-items', u'shoes_features', u'shoes_boots', u'shoes_high-heels', u'shoes_sandalsflipflops', u'shoes_wedges', u'shoes_flats', u'shoes_oxfords-loafers', u'shoes_sneakers', u'Shoes_slippers', u'branded-shop-new-arrivals-women', u'branded-shop-women-clothing-dresses', u'branded-shop-women-clothing-tops', u'branded-shop-women-clothing-outerwear', u'branded-shop-women-clothing-bottoms', u'branded-shop-women-clothing-intimates', u'branded-shop-women-accessories|branded-shop-women-clothing', u'branded-shop-women-accessories-jewelry|', u'branded-shop-shoes-women|branded-shop-women-clothing', u'branded-shop-sale-women', u'/brandedshop/brandlist.aspx', u'promo-branded-boho-me', u'promo-branded-rare-london', u'promo-branded-selfie-leslie', u'sale-newly-added', u'sale_dresses', u'sale_tops', u'sale_outerwear', u'sale_sweaters', u'sale_bottoms', u'sale_intimates', u'sale_swimwear', u'sale_activewear', u'sale_acc', u'sale_shoes', u'the-outlet', u'sale-under-5', u'sale-under-10', u'sale-under-15']
Note the values of br and category GET parameters. f21 is the "Women" category, app-main is the main page for a category.

Beautifulsoup parsing error

I am trying to extract some information about an App on Google Play and BeautifulSoup doesn't seem to work.
The link is this(say):
https://play.google.com/store/apps/details?id=com.cimaxapp.weirdfacts
My code:
url = "https://play.google.com/store/apps/details?id=com.cimaxapp.weirdfacts"
r = requests.get(url)
html = r.content
soup = BeautifulSoup(html)
l = soup.find_all("div", { "class" : "document-subtitles"})
print len(l)
0 #How is this 0?! There is clearly a div with that class
I decided to go all in, didn't work either:
i = soup.select('html body.no-focus-outline.sidebar-visible.user-has-no-subscription div#wrapper.wrapper.wrapper-with-footer div#body-content.body-content div.outer-container div.inner-container div.main-content div div.details-wrapper.apps.square-cover.id-track-partial-impression.id-deep-link-item div.details-info div.info-container div.info-box-top')
print i
What am I doing wrong?
You need to pretend to be a real browser by supplying the User-Agent header:
import requests
from bs4 import BeautifulSoup
url = "https://play.google.com/store/apps/details?id=com.cimaxapp.weirdfacts"
r = requests.get(url, headers={
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36"
})
html = r.content
soup = BeautifulSoup(html, "html.parser")
title = soup.find(class_="id-app-title").get_text()
rating = soup.select_one(".document-subtitle .star-rating-non-editable-container")["aria-label"].strip()
print(title)
print(rating)
Prints the title and the current rating:
Weird Facts
Rated 4.3 stars out of five stars
To get the additional information field values, you can use the following generic function:
def get_info(soup, text):
return soup.find("div", class_="title", text=lambda t: t and t.strip() == text).\
find_next_sibling("div", class_="content").get_text(strip=True)
Then, if you do:
print(get_info(soup, "Size"))
print(get_info(soup, "Developer"))
You will see printed:
1.4M
Email email#here.com

Categories