import requests
from bs4 import BeautifulSoup
import lxml
import urllib2
import csv

opener = urllib2.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]

f = open('ala2009link.csv', 'r')
s = open('2009alanews.csv', 'w')

for row in csv.reader(f):
    url = row[0]
    print url
    res = requests.get(url)
    print res.content
    soup = BeautifulSoup(res.content)
    print soup
    data = soup.find_all("article", {"class": "article-wrapper news"})
    #data = soup.find_all("main", {"class": "main-content"})
    for item in data:
        title = item.find_all("h2", {"class": "article-headline"})[0].text
        s.write("%s \n" % title)
    content = soup.find_all("p")
    for main in content:
        k = main.text.encode('utf-8')
        s.write("%s \n" % k)
        #k = csv.writer(s)
        #k.writerow('%s\n' % (main))

s.close()
f.close()
This is my code to extract data from the website, but I don't know why I can't extract any data. Is an ad-blocker warning blocking my BeautifulSoup?
This is an example link: http://www.rolltide.com/news/2009/6/23/Bert_Bank_Passes_Away.aspx?path=football
The reason that no results are returned is that this website requires a User-Agent header in your request.
To fix this, add a headers parameter with a User-Agent to the requests.get() call, like so:
url = 'http://www.rolltide.com/news/2009/6/23/Bert_Bank_Passes_Away.aspx?path=football'
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/29.0.1547.65 Chrome/29.0.1547.65 Safari/537.36',
}
res = requests.get(url, headers=headers)
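For reference, here is a minimal sketch of the original loop with that header applied (assuming Python 3 and the same CSV filenames as in the question; untested):

import csv
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.65 Safari/537.36'}

with open('ala2009link.csv', newline='') as f, open('2009alanews.csv', 'w') as s:
    for row in csv.reader(f):
        url = row[0]
        res = requests.get(url, headers=headers)
        soup = BeautifulSoup(res.content, 'lxml')
        # each story sits in an <article class="article-wrapper news"> element
        for item in soup.find_all('article', {'class': 'article-wrapper news'}):
            headline = item.find('h2', {'class': 'article-headline'})
            if headline:
                s.write('%s\n' % headline.text)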
I'm trying to scrape the listings for the search "Oxford, Oxfordshire" from this webpage using the requests module. This is how the input box looks before I click the search button.
I've defined an accurate selector to locate the listings, but the script fails to grab any data.
import requests
from pprint import pprint
from bs4 import BeautifulSoup
link = 'https://www.zoopla.co.uk/search/'
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9,bn;q=0.8',
    'Referer': 'https://www.zoopla.co.uk/for-sale/',
    'X-Requested-With': 'XMLHttpRequest',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
}
params = {
    'view_type': 'list',
    'section': 'for-sale',
    'q': 'Oxford, Oxfordshire',
    'geo_autocomplete_identifier': 'oxford',
    'search_source': 'home'
}
res = requests.get(link, params=params, headers=headers)
soup = BeautifulSoup(res.text, "html5lib")
for item in soup.select("[id^='listing'] a[href^='/for-sale/details/']:has(h2[data-testid='listing-title'])"):
    print(item.get("href"))
EDIT:
If I try something like the following, the script seems to work flawlessly. The main problem is that I had to use hardcoded cookies within the headers, which expire within a few minutes.
import json
from pprint import pprint
from bs4 import BeautifulSoup
import cloudscraper
base = 'https://www.zoopla.co.uk{}'
link = 'https://www.zoopla.co.uk/for-sale/'
url = 'https://www.zoopla.co.uk/for-sale/property/oxford/'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
'cookie': 'ajs_anonymous_id=caa7072ed7f64911a51dda2b525a3ca3; zooplapsid=cafe5156dd8f4cdda14e748c9270f623; base_device_id=68f7b6b7-27b8-429e-af66-366a4b64bac4; g_state={"i_p":1675619616576,"i_l":2}; zid=31173482e60549da9ccc1632e52a264c; zooplasid=31173482e60549da9ccc1632e52a264c; base_session_start_page=https://www.zoopla.co.uk/; base_request=https://www.zoopla.co.uk/; base_session_id=2315eaf2-6d59-4075-aeaa-6288af3efef7; base_session_count=8; forced_features={}; forced_experiments={}; active_session=anon; _gid=GA1.3.821027055.1675853830; __cf_bm=6.bEGFdT2vYz3G3iO7swuTFwSfhyzA0DvGoCjB6KvVg-1675853990-0-AQqWHydhL+/hqq8KRqOpCKDNtd6E96qjLgyOF77S8f7DpqCbMFoxAycD8ahQd7FOShSq0oHD//gpDj095eQPdtccDyZ0qu6GvxiSpjNP0+D7sblJP1e3Mlmxw5YroG3O4OuJHgBco3zThrx2SRyVDfx7M1zNlwi/1OVfww/u2wfb5DCW+gGz1b18zEvpNRszYQ==; cookie_consents={"schemaVersion":4,"content":{"brand":1,"consents":[{"apiVersion":1,"stored":false,"date":"Wed, 08 Feb 2023 10:59:02 GMT","categories":[{"id":1,"consentGiven":true},{"id":3,"consentGiven":false},{"id":4,"consentGiven":false}]}]}}; _ga=GA1.3.1980576228.1675275335; _ga_HMGEC3FKSZ=GS1.1.1675853830.7.1.1675853977.0.0.0'
}
params = {
    'q': 'Oxford, Oxfordshire',
    'search_source': 'home',
    'pn': 1
}
scraper = cloudscraper.create_scraper()
res = scraper.get(url,params=params,headers=headers)
print(res.status_code)
soup = BeautifulSoup(res.text,"lxml")
container = soup.select_one("script[id='__NEXT_DATA__']").contents[0]
items = json.loads(container)['props']['pageProps']['initialProps']['regularListingsFormatted']
for item in items:
    print(item['address'], base.format(item['listingUris']['detail']))
How can I get content from that site without using hardcoded cookies within the headers?
The following code example works smoothly without adding headers or params parameters. The website's data isn't dynamic, meaning you can grab the required data from the static HTML DOM, but the main hindrance is that the site uses Cloudflare protection. To get around that restriction you can use either cloudscraper instead of the requests module, or selenium. Here I use cloudscraper, and it works fine.
Script:
import pandas as pd
from bs4 import BeautifulSoup
import cloudscraper
scraper = cloudscraper.create_scraper()
kw= ['Oxford', 'Oxfordshire']
data = []
for k in kw:
    for page in range(1, 3):
        url = f"https://www.zoopla.co.uk/for-sale/property/oxford/?search_source=home&q={k}&pn={page}"
        page = scraper.get(url)
        #print(page)
        soup = BeautifulSoup(page.content, "html.parser")
        for card in soup.select('[data-testid="regular-listings"] [id^="listing"]'):
            link = "https://www.zoopla.co.uk" + card.a.get("href")
            print(link)
            #data.append({'link':link})

# df = pd.DataFrame(data)
# print(df)
Output:
https://www.zoopla.co.uk/for-sale/details/63903233/?search_identifier=4bf3e1bf22483c835fd89a5f17e16e2d
https://www.zoopla.co.uk/for-sale/details/63898182/?search_identifier=4bf3e1bf22483c835fd89a5f17e16e2d
https://www.zoopla.co.uk/for-sale/details/63898168/?search_identifier=4bf3e1bf22483c835fd89a5f17e16e2d
https://www.zoopla.co.uk/for-sale/details/63898177/?search_identifier=4bf3e1bf22483c835fd89a5f17e16e2d
https://www.zoopla.co.uk/for-sale/details/63897930/?search_identifier=4bf3e1bf22483c835fd89a5f17e16e2d
https://www.zoopla.co.uk/for-sale/details/63897571/?search_identifier=4bf3e1bf22483c835fd89a5f17e16e2d
https://www.zoopla.co.uk/for-sale/details/63896910/?search_identifier=4bf3e1bf22483c835fd89a5f17e16e2d
https://www.zoopla.co.uk/for-sale/details/63896858/?search_identifier=4bf3e1bf22483c835fd89a5f17e16e2d
https://www.zoopla.co.uk/for-sale/details/63896815/?search_identifier=4bf3e1bf22483c835fd89a5f17e16e2d
https://www.zoopla.co.uk/for-sale/details/63893187/?search_identifier=4bf3e1bf22483c835fd89a5f17e16e2d
https://www.zoopla.co.uk/for-sale/details/47501048/?search_identifier=4bf3e1bf22483c835fd89a5f17e16e2d
https://www.zoopla.co.uk/for-sale/details/63891727/?search_identifier=4bf3e1bf22483c835fd89a5f17e16e2d
https://www.zoopla.co.uk/for-sale/details/63890876/?search_identifier=4bf3e1bf22483c835fd89a5f17e16e2d
https://www.zoopla.co.uk/for-sale/details/63889459/?search_identifier=4bf3e1bf22483c835fd89a5f17e16e2d
https://www.zoopla.co.uk/for-sale/details/63888298/?search_identifier=4bf3e1bf22483c835fd89a5f17e16e2d
https://www.zoopla.co.uk/for-sale/details/63887586/?search_identifier=4bf3e1bf22483c835fd89a5f17e16e2d
https://www.zoopla.co.uk/for-sale/details/63887525/?search_identifier=4bf3e1bf22483c835fd89a5f17e16e2d
https://www.zoopla.co.uk/for-sale/details/59469692/?search_identifier=4bf3e1bf22483c835fd89a5f17e16e2d
https://www.zoopla.co.uk/for-sale/details/63882084/?search_identifier=4bf3e1bf22483c835fd89a5f17e16e2d
https://www.zoopla.co.uk/for-sale/details/63878480/?search_identifier=cbe92a4f0868061e26dff87f97442c6a
https://www.zoopla.co.uk/for-sale/details/63877980/?search_identifier=cbe92a4f0868061e26dff87f97442c6a
... so on
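As mentioned above, selenium is the other way around the Cloudflare protection; a rough, untested sketch of that route (assuming Chrome and a matching chromedriver are installed) could look like this:

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
# same search URL and listing selectors as in the cloudscraper version above
driver.get('https://www.zoopla.co.uk/for-sale/property/oxford/?q=Oxford%2C%20Oxfordshire&search_source=home')
for card in driver.find_elements(By.CSS_SELECTOR, '[data-testid="regular-listings"] [id^="listing"] a'):
    print(card.get_attribute("href"))
driver.quit()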
You could just set the browser type and read the contents with a simple request:
import urllib.request

# URL for 'Oxford, Oxfordshire'
url = 'https://www.zoopla.co.uk/for-sale/property/oxford/?q=Oxford%2C%20Oxfordshire&search_source=home'
result = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
webpage = urllib.request.urlopen(result).read()
print(webpage)
This also works just fine. The only thing is that you will have to write a couple of lines of code yourself to extract exactly what you want from each listing (a small example follows the code below), or make the class field dynamic if necessary.
import urllib.request
from bs4 import BeautifulSoup
url = 'https://www.zoopla.co.uk/for-sale/property/oxford/?q=Oxford%2C%20Oxfordshire&search_source=home'
result = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
webpage = urllib.request.urlopen(result).read()
soup = BeautifulSoup(webpage, "html.parser")
webpage_listings = soup.find_all("div", class_="f0xnzq0")
if webpage_listings:
    for item in webpage_listings:
        print(item)
else:
    print("Empty list")
from bs4 import BeautifulSoup
import requests
url = "https://www.gamerdvr.com/gamer/cookz/videos"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
element = soup.find('span', id_="most-recorded")
print(element)
This always prints "None", but when I go to the website, I can see the element. I even deleted all cookies and it's still there.
Without specifying a user agent, the site does not give you the tag you need.
from bs4 import BeautifulSoup
import requests
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
}
url = "https://www.gamerdvr.com/gamer/cookz/videos"
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')
element = soup.find('span', {'id': "most-recorded"}).get_text(strip=True)
print(element)
OUTPUT:
Fortnite
https://imgur.com/a/JcTnbiw
How do I retrieve the highlighted text with BeautifulSoup?
An example would be the best answer, thank you ;)
Edit: here's the code
import requests
import pyperclip
from bs4 import BeautifulSoup
import time
url = 'https://sales.elhst.co/'
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36"}
site = requests.get(url, headers=headers)
site = str(site)
if site == "<Response [200]>":
    print("Site is up..")
    page = requests.get(url, headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')
    time.sleep(2)
    target = soup.find("pp", id="copies")
    print(target)
and the output is:
Site is up..
<pp id="copies"></pp>
and I want to get this text:
https://imgur.com/a/JcTnbiw
Is there any way to do it?
The data you see on the page is loaded from an external URL. You can try this script to print the number of copies:
import re
import json
import requests
url = 'https://sales.elhst.co/socket.io/?EIO=3&transport=polling'
copies_url = 'https://sales.elhst.co/socket.io/?EIO=3&transport=polling&sid={sid}'
r = requests.get(url).text
sid = json.loads(re.search(r'(\{".*)', r).group(1))['sid']
r = requests.get(copies_url.format(sid=sid)).text
copies = json.loads(re.search(r'(\[".*)', r).group(1))[-1]
print(copies)
Prints:
0
from lxml import html
import requests
page = requests.get('http://url')
tree = html.fromstring(page.content)
# This will extract the text you need
buyers = tree.xpath('//pp[@id="copies"]/text()')
It should work, but I don't know the pp tag; I think it's a mistake and it should be a <p> tag.
More info about lxml is available in its documentation.
I would like to scrape the results of this booking flow.
By looking at the network tab I've found out that the data is retrieved with an AJAX GET request at this URL:
https://shop.caremar.it/main_acquista_1_corse_00_ajax.asp?l=it&data=24/02/2019&portoP=3&portoA=5&form_url=ticket_s1_2
I've built the URL, passing the parameters as follows:
params = urllib.parse.urlencode({
    'data': '24/02/2019',
    'portoP': '3',
    'portoA': '5',
    'form_url': 'ticket_s1_2',
})
and make the request:
caremar_timetable_url = "https://shop.caremar.it/main_acquista_1_corse_00_ajax.asp?l=it&"
print(f"https://shop.caremar.it/main_acquista_1_corse_00_ajax.asp?l=it&{params}")
headers = {'user-agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.3'}
res = requests.get(caremar_timetable_url,headers=headers, params=params)
soup = BeautifulSoup(res.text,'html.parser')
print(soup.text)
Output
https://shop.caremar.it/main_acquista_1_corse_00_ajax.asp?l=it&data=24%2F02%2F2019&portoP=7&portoA=1&form_url=ticket_s1_2
Non è stato possibile procedere con l'acquisto del biglietto online. Si prega di riprovare
The response is an error message from the site which says it can't complete the booking. If I copy and paste the URL I created in the browser I get an unstyled HTML page with the data I need.
Why is this and how can I overcome it?
Data seems to come back with requests
import requests
from bs4 import BeautifulSoup as bs
url = 'https://shop.caremar.it/main_acquista_1_corse_00_ajax.asp?l=it&data=27/02/2019&portoP=1&portoA=4&form_url=ticket_s1_2'
res = requests.get(url)
soup = bs(res.content, 'lxml')
print(soup.select_one('html'))
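If you need more than the raw markup, you can parse the returned fragment further; as a hedged example (the question doesn't show the fragment's structure), printing just its visible text:

print(soup.get_text(' ', strip=True))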
I want to crawl this page, but I'm having some trouble: I need to open the link for every product on the page, get its information, and save each product to an .html file.
For now I can only print all the links on the page.
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
import requests
import urllib3
import ssl
from requests import request
urllib3.disable_warnings()
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

PYTHONHTTPSVERIFY=0

user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
headers = {'User-Agent': user_agent}
t = request('GET', url=my_url, headers=headers, verify=False).text
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "html.parser")
containers = page_soup.findAll("div", {"class": 'product'})
filename = "web.html"
f= open(filename, "w")
for containers in page_soup.findAll('div', attrs={'class': 'product'}):
    f.write(containers.a['href'] + '\n')
f.close()
You appear to be trying to get a list of URLs from a first page and then extract some information from each of those URLs. To do this, a request needs to be made for each URL, with a separate BeautifulSoup parse for each.
Once you have the sub-page, the information can be extracted, for example the product's name and price.
Finally, you could either print this information or write it to a file; the easiest way is to write it as a CSV file. In this example I show how you could write the URL, name, and price as a single row. The csv library handles the formatting automatically:
from urllib3.exceptions import InsecureRequestWarning
from bs4 import BeautifulSoup
import requests
import csv
requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning)
my_url = 'https://franke-market.com.ua/moyki.html?on_page=100'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'}
req = requests.get(my_url, headers=headers, verify=False)
soup = BeautifulSoup(req.content, "html.parser")
with open("products.csv", "w", newline="") as f_output:
    csv_output = csv.writer(f_output)
    csv_output.writerow(['URL', 'Title', 'Price'])

    # Find the URLs for all the products
    for div in soup.find_all('div', attrs={'class': 'product'})[:5]:
        url = div.a['href']
        print(url)

        # For each URL, get the sub page and get the name and price
        req_sub = requests.get(url, headers=headers, verify=False)
        soup_sub = BeautifulSoup(req_sub.content, "html.parser")
        title = soup_sub.find('h1', class_='title').text
        price_info = soup_sub.find('div', class_='price_info').span.text

        # Write the url, name and price as a CSV file
        csv_output.writerow([url, title, price_info])
Giving you a products.csv file starting:
URL,Title,Price
https://franke-market.com.ua/franke_rol_610-38_101_0267_707_.html,Franke ROL 610-38 (101.0267.707),91795
https://franke-market.com.ua/franke-pmn-611i-101.0255.790.html,Franke Pamira PMN 611i (101.0255.790),57935
https://franke-market.com.ua/franke_pxl_611-60_101_0330_655_.html,Franke PXL 611-60 (101.0330.655),93222
https://franke-market.com.ua/franke-ron-610-41-101.0255.783.html,Franke Ron 610-41 (101.0255.783),57939
https://franke-market.com.ua/franke_pxl_611-78_101_0330_657_.html,Franke PXL 611-78 (101.0330.657),93223
You could then open this file into a spreadsheet application.
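Or, as a quick optional check from Python itself (a tiny sketch using pandas, assuming it is installed):

import pandas as pd

df = pd.read_csv("products.csv")
print(df.head())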