Extract href from lazy-loading page - python

I am trying to extract the hrefs from the Premier League website, but I cannot seem to get any links beyond the first page:
```
import requests
from bs4 import BeautifulSoup

r = requests.get('https://www.premierleague.com/players/')
soup = BeautifulSoup(r.content, 'lxml')

# get the player index
table = soup.find('div', {'class': 'table playerIndex'})

# <a> is where the href is stored
href_names = [link.get('href') for link in table.findAll('a')]

football_string = 'https://www.premierleague.com'

# concatenate to get the full link
full_links = [football_string + str(x) for x in href_names]
```
This only returns the first page. I have tried Selenium, but the Premier League website shows an ad every time it loads that prevents it from working. Any ideas on how to get all the links?

If I understood your question right, the following approach should do it:
```
import requests

base = 'https://www.premierleague.com/players/{}/'
link = 'https://footballapi.pulselive.com/football/players'

payload = {
    'pageSize': '30',
    'compSeasons': '418',
    'altIds': 'true',
    'page': 0,
    'type': 'player',
    'id': '-1',
    'compSeasonId': '418'
}

with requests.Session() as s:
    s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    s.headers['referer'] = 'https://www.premierleague.com/'
    s.headers['origin'] = 'https://www.premierleague.com'
    while True:
        res = s.get(link, params=payload)
        content = res.json()['content']
        if not content:
            break
        for item in content:
            print(base.format(int(item['id'])))
        payload['page'] += 1
```
Results are like (truncated):
```
https://www.premierleague.com/players/19970/
https://www.premierleague.com/players/13279/
https://www.premierleague.com/players/13286/
https://www.premierleague.com/players/10905/
https://www.premierleague.com/players/4852/
https://www.premierleague.com/players/4328/
https://www.premierleague.com/players/90665/
```
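If you want the full URLs collected into a list (as in your original snippet) rather than printed, the same loop is easy to adapt. A minimal sketch, reusing the base, link, and payload definitions above:
```
player_links = []
payload['page'] = 0  # rewind pagination before re-running the loop

with requests.Session() as s:
    s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    s.headers['referer'] = 'https://www.premierleague.com/'
    s.headers['origin'] = 'https://www.premierleague.com'
    while True:
        res = s.get(link, params=payload)
        content = res.json()['content']
        if not content:
            break
        # build the same premierleague.com player URLs, but keep them
        player_links.extend(base.format(int(item['id'])) for item in content)
        payload['page'] += 1

print(len(player_links))
```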

Related

Scrape data inside a <script type="text/javascript"> tag using BeautifulSoup

I'm building a web scraper to pull product data from a website. This particular company hides the price behind a "Login for Price" banner, but the price is present in the HTML inside a <script type="text/javascript"> tag, and I'm unable to pull it out. The specific link I'm testing is https://www.chadwellsupply.com/categories/appliances/Stove-Ranges/hotpoint-24-spacesaver-electric-range---white/
My current code is below; the last line is the one I'm using to pull the text out.
```
import requests
from bs4 import BeautifulSoup
import pandas as pd

baseurl = "https://www.chadwellsupply.com/"
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

productlinks = []
for x in range(1, 3):
    response = requests.get(f'https://www.chadwellsupply.com/categories/appliances/Stove-Ranges/?q=&filter=&clearedfilter=undefined&orderby=19&pagesize=24&viewmode=list&currenttab=products&pagenumber={x}&articlepage=')
    soup = BeautifulSoup(response.content, 'html.parser')
    productlist = soup.find_all('div', class_="product-header")
    for item in productlist:
        for link in item.find_all('a', href=True):
            productlinks.append(link['href'])

testlink = 'https://www.chadwellsupply.com/categories/appliances/Stove-Ranges/hotpoint-24-spacesaver-electric-range---white/'
response = requests.get(testlink, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')

print(soup.find('div', class_="product-title").text.strip())
print(soup.find('p', class_="status").text.strip())
print(soup.find('meta', {'property': 'og:url'}))
print(soup.find('div', class_="tab-pane fade show active").text.strip())
print(soup.find('div', class_="Chadwell-Shared-Breadcrumbs").text.strip())
print(soup.find('script', {'type': 'text/javascript'}).text.strip())
```
Below is the chunk of script from the website that I expect it to pull (I tried to paste it directly here, but it wouldn't format correctly). What it actually gives me is:
"window.dataLayer = window.dataLayer || [];"
[screenshot: the HTML from the website]
Ideally I'd like to just pull the price out, but if I can at least get the whole chunk of data out, I can extract the price manually.
You can use the re and json modules to search/parse the HTML data (obviously, BeautifulSoup cannot parse JavaScript; another option is to use Selenium).
```
import re
import json
import requests

url = "https://www.chadwellsupply.com/categories/appliances/Stove-Ranges/hotpoint-24-spacesaver-electric-range---white/"
html_doc = requests.get(url).text

# pull the JSON argument out of the ga('ec:addProduct', ...) call
data = re.search(r"ga\('ec:addProduct', (.*?)\);", html_doc).group(1)
data = json.loads(data)
print(data)
```
Prints:
```
{
    'id': '301078',
    'name': 'HOTPOINT® 24" SPACESAVER ELECTRIC RANGE - WHITE',
    'category': 'Stove/ Ranges',
    'brand': 'Hotpoint',
    'price': '759'
}
```
Then for the price you can do:
```
print(data["price"])
```
Prints:
759
A hacky alternative to regex is to select for a function in the scripts. In your case, the script contains function(i,s,o,g,r,a,m).
```
from bs4 import BeautifulSoup
import requests
import json

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
testlink = 'https://www.chadwellsupply.com/categories/appliances/Stove-Ranges/hotpoint-24-spacesaver-electric-range---white/'

response = requests.get(testlink, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')

# keep the script whose body contains the target function signature
for el in soup.find_all("script"):
    if "function(i,s,o,g,r,a,m)" in el.text:
        scripttext = el.text
```
You can then select the data.
```
extracted = scripttext.split("{")[-1].split("}")[0]
my_json = json.loads("{%s}" % extracted)
print(my_json)
# {'id': '301078', 'name': 'HOTPOINT® 24" SPACESAVER ELECTRIC RANGE - WHITE', 'category': 'Stove/ Ranges', 'brand': 'Hotpoint', 'price': '759'}
```
Then get the price.
```
print(my_json["price"])
# 759
```

Python requests for "load more" data

I want to scrape the product images from https://society6.com/art/i-already-want-to-take-a-nap-tomorrow-pink for each product.
Step 1: go into the div with class 'card_card__l44w' (which holds each product link).
Step 2: parse the href of each product.
But it only gets back the first 15 product links instead of all 44.
The second thing is: when I parse each product link and grab the JSON from it at ['product']['response']['product']['data']['attributes']['media_map'], after the media_map key there are many other keys like b, c, d, e, f, g (all having src: in them with the image link). I only want to parse the .jpg image from every key.
Below is my code:
```
import requests
import json
from bs4 import BeautifulSoup
import pandas as pd

baseurl = 'https://society6.com/'
headers = {
    "User-Agent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
}

r = requests.get('https://society6.com/art/flamingo-cone501586', headers=headers)
soup = BeautifulSoup(r.content, 'lxml')

productslist = soup.find_all('div', class_='card_card__l44w')
productlinks = []
for item in productslist:
    for link in item.find_all('a', href=True):
        productlinks.append(baseurl + link['href'])

newlist = []
for link in productlinks:
    r = requests.get(link, headers=headers)
    soup = BeautifulSoup(r.content, 'lxml')
    scripts = soup.find_all('script')[9].text.strip()[24:]
    data = json.loads(scripts)
    url = data['product']['response']['product']['data']['attributes']['media_map']
    detail = {
        'links': url
    }
    newlist.append(detail)
    print('saving')

df = pd.DataFrame(newlist)
df.to_csv('haja.csv')
```
All the information is loaded on the first visit, and all 66 products are stored in window.__INITIAL_STATE. If you scroll almost to the end of the file you can see it. You can use that to parse the information:
```
import re
import json

# soup is the BeautifulSoup object from your code above (the product page)
# find the script that assigns window.__INITIAL_STATE and strip the non-JSON parts
data = json.loads(soup
                  .find("script", text=re.compile("^window.__INITIAL_STATE"))
                  .text
                  .replace("</script>", "")
                  .replace("window.__INITIAL_STATE = ", ""))

products = data["designDetails"]["response"]["designDetails"]["data"]["products"]
```
products is a list with 66 items. Example:
```
{'sku': 's6-7120491p92a240v826',
 'retail_price': 29,
 'discount_price': 24.65,
 'image_url': 'https://ctl.s6img.com/society6/img/yF7u4l5D3MODQBBerUQBHdYsfN8/h_264,w_264/acrylic-boxes/small/top/~artwork,fw_1087,fh_1087,fx_-401,fy_-401,iw_1889,ih_1889/s6-original-art-uploads/society6/uploads/misc/f7916751f46d4d9c9fb7f6fe4e5d5729/~~/flamingo-cone501586-acrylic-boxes.jpg',
 'product_type': {'id': 92,
  'title': 'Acrylic Box',
  'slug': 'acrylic-box',
  'slug_plural': 'acrylic-boxes'},
 'department': {'id': 83, 'title': 'Office'},
 'sort': 0}
```
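For the second part of the question (keeping only the .jpg image for each product), you can filter the products list above on its image_url field. A short sketch, assuming every entry carries an image_url string like the example shows:
```
# keep only products whose image URL ends in .jpg
jpg_links = [
    p["image_url"]
    for p in products
    if p.get("image_url", "").endswith(".jpg")
]
print(len(jpg_links))
```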

How do I scrape all movie titles, dates and reviews from the website below? https://www.nollywoodreinvented.com/list-of-all-reviews

I have tried the code below; it only brings back the first page and does not completely load the reviews for the movies. I am interested in getting all the movie titles, movie dates, and reviews.
```
from bs4 import BeautifulSoup
import requests

url = 'https://www.nollywoodreinvented.com/list-of-all-reviews'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:55.0) Gecko/20100101 Firefox/55.0',
}

req = requests.get(url, headers=headers)
soup = BeautifulSoup(req.text, 'lxml')

movie_div = soup.find_all('div', class_='article-panel')

title = []
for div in movie_div:
    images = div.find_all('div', class_='article-image-wrapper')
    for image in images:
        image = image.find_all('div', class_='article-image')
        for img in image:
            title.append(img.a.img['title'])

date = []
for div in movie_div:
    date.append(div.find('div', class_='authorship type-date').text.strip())

info = []
for div in movie_div:
    info.append(div.find('div', class_='excerpt-text').text.strip())

import pandas as pd

movie = pd.DataFrame({'title': title, 'date': date, 'info': info}, index=None)
movie.head()
```
There is a backend API which serves up the HTML you are scraping. You can see it in action if you open your browser's developer tools (Network tab → Fetch/XHR) and click the 2nd or 3rd page link. We can recreate that POST request with Python like the below:
```
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime

pages = 3
results_per_page = 500  # max 500 I think
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'}
url = 'https://www.nollywoodreinvented.com/wp-admin/admin-ajax.php'

output = []
for page in range(1, pages + 1):
    payload = {
        'action': 'itajax-sort',
        'view': 'grid',
        'loop': 'main loop',
        'location': '',
        'thumbnail': '1',
        'rating': '1',
        'meta': '1',
        'award': '1',
        'badge': '1',
        'authorship': '1',
        'icon': '1',
        'excerpt': '1',
        'sorter': 'recent',
        'columns': '4',
        'layout': 'full',
        'numarticles': str(results_per_page),
        'largefirst': '',
        'paginated': str(page),
        # a dict cannot hold duplicate keys; requests encodes a list
        # as a repeated form key, so both categories get sent
        'currentquery[category__in][]': ['2648', '2649']
    }
    resp = requests.post(url, headers=headers, data=payload).json()
    print(f'Scraping page: {page} - results: {results_per_page}')
    soup = BeautifulSoup(resp['content'], 'html.parser')
    for film in soup.find_all('div', class_='article-panel'):
        try:
            title = film.find('h3').text.strip()
        except AttributeError:
            continue
        date = datetime.strptime(film.find('span', class_='date').text.strip(), "%B %d, %Y").strftime('%Y-%m-%d')
        likes = film.find('span', class_='numcount').text.strip()
        if not likes:
            likes = 0
        full_stars = [1 for _ in film.find_all('span', class_='theme-icon-star-full')]
        half_stars = [0.5 for _ in film.find_all('span', class_='theme-icon-star-half')]
        stars = (sum(full_stars) + sum(half_stars)) / 2.0
        item = {
            'title': title,
            'date': date,
            'likes': likes,
            'stars': stars
        }
        output.append(item)

df = pd.DataFrame(output)
df.to_csv('nollywood_data.csv', index=False)
print('Saved to nollywood_data.csv')
```

How to scrape the page inside the result card using BS4?

```
<img class="no-img" data-src="https://im1.dineout.co.in/images/uploads/restaurant/sharpen/4/h/u/p4059-15500352575c63a9394c209.jpg?tr=tr:n-medium" alt="Biryani By Kilo" data-gatype="RestaurantImageClick" data-url="/delhi/biryani-by-kilo-connaught-place-central-delhi-40178" data-w-onclick="cardClickHandler" src="https://im1.dineout.co.in/images/uploads/restaurant/sharpen/4/h/u/p4059-15500352575c63a9394c209.jpg?tr=tr:n-medium">
```
Page URL: https://www.dineout.co.in/delhi-restaurants?search_str=biryani&p=1
This page contains some restaurant cards. While scraping the page in a loop, I want to go inside each restaurant card's URL (the data-url attribute in the HTML above) and scrape the number of reviews from it, but I don't know how to do that. My current code for the normal front-page scraping is:
```
import re
import requests
from bs4 import BeautifulSoup

def extract(page):
    url = f"https://www.dineout.co.in/delhi-restaurants?search_str=biryani&p={page}"  # URL of the website
    header = {'User-Agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36'}  # temporary user agent
    r = requests.get(url, headers=header)
    soup = BeautifulSoup(r.content, 'html.parser')
    return soup

def transform(soup):  # function to scrape the page
    divs = soup.find_all('div', class_='restnt-card restaurant')
    for item in divs:
        title = item.find('a').text.strip()  # restaurant name
        loc = item.find('div', class_='restnt-loc ellipsis').text.strip()  # restaurant location
        try:  # some restaurants are unrated, and scraping those would raise an error
            rating = item.find('div', class_='img-wrap').text
            rating = re.sub("[^0-9,.]", "", rating)
        except:
            rating = None
        price_raw = item.find('span', class_='double-line-ellipsis').text.strip()  # price for biryani
        price = re.sub("[^0-9]", "", price_raw)[:-1]
        biry_del = {
            'name': title,
            'location': loc,
            'rating': rating,
            'price': price
        }
        rest_list.append(biry_del)

rest_list = []
for i in range(1, 18):
    print(f'getting page, {i}')
    c = extract(i)
    transform(c)
```
I hope you guys understood; please ask in the comments about anything confusing.
It's not very fast, but it looks like you can get all the details you want, including the review count (not 232!), if you hit this backend API endpoint:
https://www.dineout.co.in/get_rdp_data_main/delhi/69676/restaurant_detail_main
```
import requests
from bs4 import BeautifulSoup
import pandas as pd

rest_list = []
for page in range(1, 3):
    print(f'getting page, {page}')
    s = requests.Session()
    url = f"https://www.dineout.co.in/delhi-restaurants?search_str=biryani&p={page}"  # URL of the website
    header = {'User-Agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36'}  # temporary user agent
    r = s.get(url, headers=header)
    soup = BeautifulSoup(r.content, 'html.parser')
    divs = soup.find_all('div', class_='restnt-card restaurant')
    for item in divs:
        code = item.find('a')['href'].split('-')[-1]  # restaurant code
        print(f'Getting details for {code}')
        data = s.get(f'https://www.dineout.co.in/get_rdp_data_main/delhi/{code}/restaurant_detail_main').json()
        info = data['header']
        info.pop('share')  # clean up csv
        info.pop('options')
        rest_list.append(info)

df = pd.DataFrame(rest_list)
df.to_csv('dehli_rest.csv', index=False)
```
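If all you need is the review count, you can keep just that field from the header JSON instead of the whole dict. The key names below are guesses, not confirmed against the API; print info.keys() once to find the real ones:
```
# 'name' and 'rating_count' are hypothetical key names -- inspect info.keys() to confirm
rest_list.append({
    'name': info.get('name'),
    'reviews': info.get('rating_count')
})
```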

BeautifulSoup parsing error

I am trying to extract some information about an app on Google Play, and BeautifulSoup doesn't seem to work.
The link is this (say):
https://play.google.com/store/apps/details?id=com.cimaxapp.weirdfacts
My code:
```
url = "https://play.google.com/store/apps/details?id=com.cimaxapp.weirdfacts"
r = requests.get(url)
html = r.content
soup = BeautifulSoup(html)
l = soup.find_all("div", {"class": "document-subtitles"})
print(len(l))  # prints 0 -- how is this 0?! There is clearly a div with that class
```
I decided to go all in; that didn't work either:
```
i = soup.select('html body.no-focus-outline.sidebar-visible.user-has-no-subscription div#wrapper.wrapper.wrapper-with-footer div#body-content.body-content div.outer-container div.inner-container div.main-content div div.details-wrapper.apps.square-cover.id-track-partial-impression.id-deep-link-item div.details-info div.info-container div.info-box-top')
print(i)
```
What am I doing wrong?
You need to pretend to be a real browser by supplying the User-Agent header:
```
import requests
from bs4 import BeautifulSoup

url = "https://play.google.com/store/apps/details?id=com.cimaxapp.weirdfacts"
r = requests.get(url, headers={
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36"
})

html = r.content
soup = BeautifulSoup(html, "html.parser")

title = soup.find(class_="id-app-title").get_text()
rating = soup.select_one(".document-subtitle .star-rating-non-editable-container")["aria-label"].strip()

print(title)
print(rating)
```
Prints the title and the current rating:
Weird Facts
Rated 4.3 stars out of five stars
To get the additional information field values, you can use the following generic function:
```
def get_info(soup, text):
    return soup.find("div", class_="title", text=lambda t: t and t.strip() == text).\
        find_next_sibling("div", class_="content").get_text(strip=True)
```
Then, if you do:
print(get_info(soup, "Size"))
print(get_info(soup, "Developer"))
You will see printed:
1.4M
Email email#here.com
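The same helper works for any labelled field in the page's additional-information block, so you can loop over whichever fields you need. Labels beyond "Size" and "Developer" are assumptions; check the page for the exact wording:
```
# "Installs" is an assumed label -- verify it exists on the page
for field in ("Size", "Installs", "Developer"):
    print(field, "->", get_info(soup, field))
```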
