I am unable to get all the links present on the page - python

I am using BeautifulSoup and urllib to extract a webpage. I have set the user agent and the cookie, and yet I fail to receive all the links from the webpage...
Here's my code:
import bs4 as bs
import urllib.request
import requests
#sauce = urllib.request.urlopen('https://github.com/search?q=javascript&type=Code&utf8=%E2%9C%93').read()
#soup = bs.BeautifulSoup(sauce,'lxml')
'''
session = requests.Session()
response = session.get(url)
print(session.cookies.get_dict())
'''
url = 'https://github.com/search?q=javascript&type=Code&utf8=%E2%9C%93'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
           'Cookie': '_gh_sess=eyJzZXNzaW9uX2lkIjoiMDNhMGI2NjQxZjY4Mjc1YmQ3ZjAyNmJiODM2YzIzMTUiLCJfY3NyZl90b2tlbiI6IlJJOUtrd3E3WVFOYldVUzkwdmUxZ0Z4MHZLN3M2eE83SzhIdVJTUFVsVVU9In0%3D--4485d36d4c86aec01cde254e34db68005193546e; logged_in: no'}
response = requests.get(url, headers=headers)
print(response.cookies)
soup = bs.BeautifulSoup(response.content, 'lxml')
for url in soup.find_all('a'):
    print(url.get('href'))
Is there something I'm missing? In a browser I get the links to all the code, whereas in the script I get only a few of the links, none of them pointing to the code...
The webpage opens perfectly in the browser...
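A quick way to see what the server actually sends back (as opposed to what the browser renders) is to check the status code, count the anchors, and save the raw HTML for inspection. This is only a diagnostic sketch reusing the url and headers above; the file name github_search.html is just an illustrative choice:
import bs4 as bs
import requests

# Diagnostic sketch: compare what the script receives with what the browser shows.
response = requests.get(url, headers=headers)
print(response.status_code)                        # did the server accept the request?
soup = bs.BeautifulSoup(response.content, 'lxml')
print(len(soup.find_all('a')))                     # how many links the script actually sees

# Save the raw HTML so it can be opened locally and compared with the live page.
with open('github_search.html', 'w', encoding='utf-8') as f:
    f.write(response.text)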

Related

I am trying to get one element of a website but it prints "none" (Python Requests)

from bs4 import BeautifulSoup
import requests
url = "https://www.gamerdvr.com/gamer/cookz/videos"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
element = soup.find('span', id_="most-recorded")
print(element)
This always prints "none" but when I go to the website, I can see it. I even deleted all cookies and it's still there.
Without specifying a user agent, the site does not give you the tag you need.
from bs4 import BeautifulSoup
import requests
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
}
url = "https://www.gamerdvr.com/gamer/cookz/videos"
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')
element = soup.find('span', {'id': "most-recorded"}).get_text(strip=True)
print(element)
OUTPUT:
Fortnite
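To see that the user agent really is what makes the difference, the two responses can be compared directly; a small check along the lines of the answer above (same URL, same parser) might look like this:
import requests
from bs4 import BeautifulSoup

url = "https://www.gamerdvr.com/gamer/cookz/videos"
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'}

# Fetch the page once without and once with the user-agent header.
plain = BeautifulSoup(requests.get(url).content, 'html.parser')
with_ua = BeautifulSoup(requests.get(url, headers=headers).content, 'html.parser')

print(plain.find('span', {'id': 'most-recorded'}))    # None, per the answer above
print(with_ua.find('span', {'id': 'most-recorded'}))  # the most-recorded span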

Element is not in response Python Requests

I would like to scrape the last archived odds from this page https://www.betexplorer.com/soccer/estonia/esiliiga/elva-flora-tallinn/Q9KlbwaJ/ but I can't get them with requests. How can I get them without interacting with Selenium?
To trigger the archive-odds request in the Developer Tools I need to hover over the odds.
Code
url = "https://www.betexplorer.com/archive-odds/4l4ubxv464x0xc78lr/14/"
headers = {
"Referer": "https://www.betexplorer.com",
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36'
}
Json = requests.get(url, headers=headers).json()
As the site is loaded by JavaScript, requests alone doesn't work. I have used Selenium to load the page and extract the complete source code after everything is loaded.
Then I used BeautifulSoup to create a soup object and get the required data.
From the source code you can see that the data-bid attributes of the <tr> elements are what is passed to fetch the odds data.
I extracted all the data-bid values and passed them, one by one, to the URL you provided at the very end of your question.
This code will get all the odds data in JSON format:
import time
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
base_url = 'https://www.betexplorer.com/soccer/estonia/esiliiga/elva-flora-tallinn/Q9KlbwaJ/'
driver = webdriver.Chrome()
driver.get(base_url)
time.sleep(5)
soup = BeautifulSoup(driver.page_source, 'html.parser')
t = soup.find('table', attrs= {'id': 'sortable-1'})
trs = t.find('tbody').findAll('tr')
for i in trs:
    data_bid = i['data-bid']
    url = f"https://www.betexplorer.com/archive-odds/4l4ubxv464x0xc78lr/{data_bid}/"
    headers = {"Referer": "https://www.betexplorer.com", 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36'}
    Json = requests.get(url, headers=headers).json()
    # Do what you wish to do with the JSON data here....
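As a small follow-up (not part of the original answer), the responses can be collected into a dictionary keyed by bid and the browser closed once the loop finishes; the name odds_by_bid below is just an illustrative choice:
# Collect each bid's JSON payload, then release the Chrome instance.
headers = {"Referer": "https://www.betexplorer.com", 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36'}
odds_by_bid = {}
for i in trs:
    data_bid = i['data-bid']
    url = f"https://www.betexplorer.com/archive-odds/4l4ubxv464x0xc78lr/{data_bid}/"
    odds_by_bid[data_bid] = requests.get(url, headers=headers).json()

driver.quit()  # close the browser started by webdriver.Chrome()
print(len(odds_by_bid), "odds payloads collected")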

Tried scraping Groupon for practice only. Why does it return fewer results?

I am new to Python scraping, so as part of the practice I was trying a few other sites, where often no data was returned at all; but when I checked Groupon, I found that urllib only returns the first 8 results, while there are 36 results on the browser page.
I am using urllib and BS4. Below is the code:
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
req = Request('https://www.groupon.com/browse/chicago?category=beauty-and-spas')
req.add_header('User-Agent',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36')
try:
    with urlopen(req) as response:
        htmlcontent = response.read().decode('utf-8')
except:
    htmlcontent = None
soup = BeautifulSoup(htmlcontent, 'lxml')
all_links = soup.find('div', { 'id': 'pull-results' }).select('figure > div > a')
Can somebody please tell me what I am missing in the code to be able to extract all the data?
If this doesn't or shouldn't work, is Selenium the only option we have?
Try the following to get all the items and their links, traversing the next pages:
import requests
from bs4 import BeautifulSoup
base_link = 'https://www.groupon.com/browse/chicago?category=beauty-and-spas'
url = 'https://www.groupon.com/partial/browse/get-paginated-cards?'
params = {
    'category': 'beauty-and-spas',
    'page': 1
}
with requests.Session() as s:
    s.headers['user-agent'] = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
    r = s.get(base_link)
    params['_csrf'] = r.cookies['_csrf']
    while True:
        print("current page----------->", params['page'])
        res = s.get(url, params=params)
        soup = BeautifulSoup(res.json()[0]['cardSlotHtml'], "lxml")
        if not soup.select_one("figure[data-pingdom-info='purchasable-deal']"):
            break
        for item in soup.select("figure[data-pingdom-info='purchasable-deal']"):
            item_title = item.select_one(".cui-udc-details .cui-udc-title").get_text(strip=True)
            item_link = item.select_one(".cui-content > a[href]").get("href")
            print(item_title, item_link)
        params['page'] += 1
The other 28 links are loaded dynamically, so plain urllib never receives them. However, you can scrape them by sending a GET request to:
https://www.groupon.com/partial/browse/get-lazy-loaded-cards?category=beauty-and-spas&_csrf=P6rFPl1o-xDta8uOABKo_9LOiUajyK9bieMg
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
req = Request(
    "https://www.groupon.com/partial/browse/get-lazy-loaded-cards?category=beauty-and-spas&_csrf=P6rFPl1o-xDta8uOABKo_9LOiUajyK9bieMg"
)
req.add_header(
    "User-Agent",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36",
)
with urlopen(req) as response:
    htmlcontent = response.read().decode("utf-8")
soup = BeautifulSoup(htmlcontent, "lxml")
for tag in soup.find_all(class_=r'\"cui-content\"'):
    try:
        link = tag.find("a")["href"]
    except TypeError:
        continue
    print(link.replace('\\"', "").replace("\\n", ""))
Output:
https://www.groupon.com/deals/yupo-health-2?deal_option=75076cab-9268-4c2d-9459-e79a559dfed6
https://www.groupon.com/deals/infinity-laser-spa-2-4?deal_option=4dc14f7d-29ac-45e1-a664-ea76ddc44718
https://www.groupon.com/deals/dr-laser-nyc-3?deal_option=232325a0-8b3f-4fa9-b42b-7dc668ff7474
...
...
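One caveat not spelled out in that answer: the _csrf value hard-coded in the URL is tied to a session and will eventually stop working, so it is safer to fetch the browse page first and read a fresh token from its cookies, as the first answer does. A short sketch of that approach (requests instead of urllib, endpoints as shown above):
import requests
from bs4 import BeautifulSoup

with requests.Session() as s:
    s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
    r = s.get('https://www.groupon.com/browse/chicago?category=beauty-and-spas')
    csrf = r.cookies['_csrf']  # fresh token set by the browse page
    lazy = s.get('https://www.groupon.com/partial/browse/get-lazy-loaded-cards',
                 params={'category': 'beauty-and-spas', '_csrf': csrf})
    soup = BeautifulSoup(lazy.text, 'lxml')
    print(len(soup.find_all('a')))  # rough check that the lazy-loaded cards came back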

How to scrape data from a page with BeautifulSoup

The question is very simple, but for me it doesn't work and I don't know why!
I want to scrape the beer rating from this page https://www.brewersfriend.com/homebrew/recipe/view/16367/southern-tier-pumking-clone with BeautifulSoup, but it doesn't work.
This is my code:
import requests
import bs4
from bs4 import BeautifulSoup
url = 'https://www.brewersfriend.com/homebrew/recipe/view/16367/southern-tier-pumking-clone'
test_html = requests.get(url).text
soup = BeautifulSoup(test_html, "lxml")
rating = soup.findAll("span", class_="ratingValue")
rating
When I run it, it doesn't work, but if I do the same thing with another page it works... I don't know why. Can someone help me? The rating should be 4.58.
Thanks everybody!
If you print test_html, you'll find you got a 403 Forbidden response.
You should add headers (at least a user-agent) to your GET request.
import requests
from bs4 import BeautifulSoup
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'
}
url = 'https://www.brewersfriend.com/homebrew/recipe/view/16367/southern-tier-pumking-clone'
test_html = requests.get(url, headers=headers).text
soup = BeautifulSoup(test_html, 'html5lib')
rating = soup.find('span', {'itemprop': 'ratingValue'})
print(rating.text)
# 4.58
The reason behind the Forbidden status code (HTTP error 403) is that the server will not fulfill your request despite understanding it. You will almost certainly get this error if you try to scrape many of the more popular websites, which have security features to prevent bots. So you need to disguise your request!
For that you need to use headers.
You also need to correct the tag attribute whose data you're trying to get, i.e. itemprop.
Use lxml as your tree builder, or any other parser of your choice.
import requests
from bs4 import BeautifulSoup
url = 'https://www.brewersfriend.com/homebrew/recipe/view/16367/southern-tier-pumking-clone'
# Add this
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
test_html = requests.get(url, headers=headers).text
soup = BeautifulSoup(test_html, 'lxml')
rating = soup.find('span', {'itemprop':'ratingValue'})
print(rating.text)
The page you are requesting responds with 403 Forbidden, so you might not get an error, but you will get an empty result such as []. To avoid this situation we add a user agent; this code will get you the desired result.
import urllib.request
from bs4 import BeautifulSoup
user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
url = "https://www.brewersfriend.com/homebrew/recipe/view/16367/southern-tier-pumking-clone"
headers={'User-Agent':user_agent}
request=urllib.request.Request(url,None,headers) #The assembled request
response = urllib.request.urlopen(request)
soup = BeautifulSoup(response, "lxml")
rating = soup.find('span', {'itemprop':'ratingValue'})
rating.text
import requests
from bs4 import BeautifulSoup

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36'
}
url = 'https://www.brewersfriend.com/homebrew/recipe/view/16367/southern-tier-pumking-clone'
test_html = requests.get(url, headers=headers).text
soup = BeautifulSoup(test_html, 'html5lib')
rating = soup.find('span', {'itemprop': 'ratingValue'})
print(rating.text)
You are facing this error because some websites can't be scraped with Beautiful Soup alone. For these kinds of websites you have to use Selenium:
Download the latest ChromeDriver from this link according to your operating system.
Install Selenium with the command "pip install selenium".
# import required modules
import selenium
from selenium import webdriver
from bs4 import BeautifulSoup
import time, os
current_dir = os.getcwd()
print(current_dir)
# concatenate the web driver name with your current dir; if you are using Windows, change "/" to "\"
# make sure you placed chromedriver in the current directory
driver = webdriver.Chrome(current_dir + '/chromedriver')
# driver.get opens the URL in your browser
driver.get('https://www.brewersfriend.com/homebrew/recipe/view/16367/southern-tier-pumking-clone')
time.sleep(1)
# fetch the rendered HTML from the driver
super_html = driver.page_source
# now parse the raw data with 'html.parser'
soup = BeautifulSoup(super_html, "html.parser")
rating = soup.findAll("span",itemprop="ratingValue")
rating[0].text
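As a side note (not part of the original answer), the visible browser window can be avoided by running Chrome headless through Selenium's ChromeOptions; a minimal sketch, assuming chromedriver is on the PATH:
from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless')           # run Chrome without opening a window
driver = webdriver.Chrome(options=options)   # assumes chromedriver is on the PATH
driver.get('https://www.brewersfriend.com/homebrew/recipe/view/16367/southern-tier-pumking-clone')
html = driver.page_source
driver.quit()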

How to crawl website slower in Python with Jupyter Notebook?

My current Python script scrapes 2 pages of the website in about one second. I want to make it go slower, say 25 seconds per page. How do I do that?
I tried the following Python script.
# Dependencies
from bs4 import BeautifulSoup
import requests
import pandas as pd
# Testing
linked = 'https://www.zillow.com/homes/for_sale/San-Francisco-CA/fsba,fsbo,fore,new_lt/house_type/20330_rid/globalrelevanceex_sort/37.859675,-122.285557,37.690612,-122.580815_rect/11_zm/{}_p/0_mmm/'
for link in [linked.format(page) for page in range(1,2)]:
    user_agent = 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'
    headers = {'User-Agent': user_agent}
    response = requests.get(link, headers=headers)
    soup = BeautifulSoup(response.text, 'html.pafinite-item')
    print(soup)
What should I add to my script to make the web scraping go slower?
Just use time.sleep:
import requests
import pandas as pd
from time import sleep
from bs4 import BeautifulSoup
linked = 'https://www.zillow.com/homes/for_sale/San-Francisco-CA/fsba,fsbo,fore,new_lt/house_type/20330_rid/globalrelevanceex_sort/37.859675,-122.285557,37.690612,-122.580815_rect/11_zm/{}_p/0_mmm/'
for link in [linked.format(page) for page in range(1,2)]:
    sleep(25.0)
    user_agent = 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'
    headers = {'User-Agent': user_agent}
    response = requests.get(link, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')  # note: 'html.pafinite-item' is not a valid parser name
    print(soup)
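A common refinement (not part of the original answer) is to randomize the delay instead of always waiting exactly 25 seconds, so the request pattern is less uniform; a minimal sketch using only the standard library and reusing the linked template from above:
from random import uniform
from time import sleep
import requests

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'}
for link in [linked.format(page) for page in range(1, 2)]:
    sleep(uniform(20.0, 30.0))  # wait a random 20-30 seconds before each request
    response = requests.get(link, headers=headers)
    print(response.status_code)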
