Can't scrape names from next pages using requests - python

I'm trying to parse names across multiple pages of a website using a Python script. With my current attempt I can get the names from its landing page. However, I can't figure out how to fetch the names from the subsequent pages as well using requests and BeautifulSoup.
website link
My attempt so far:
import requests
from bs4 import BeautifulSoup

url = "https://proximity.niceic.com/mainform.aspx?PostCode=YO95"

with requests.Session() as s:
    r = s.get(url)
    soup = BeautifulSoup(r.text, "lxml")
    for elem in soup.select("table#gvContractors tr:has([id*='_lblName'])"):
        name = elem.select_one("span[id*='_lblName']").get_text(strip=True)
        print(name)
I've tried to modify my script to get only the content from the second page, to make sure it works when a next-page button is involved, but unfortunately it still fetches data from the first page:
import requests
from bs4 import BeautifulSoup

url = "https://proximity.niceic.com/mainform.aspx?PostCode=YO95"

with requests.Session() as s:
    r = s.get(url)
    soup = BeautifulSoup(r.text, "lxml")
    payload = {i['name']: i.get('value', '') for i in soup.select('input[name]')}
    payload['__EVENTARGUMENT'] = 'Page$Next'
    payload.pop('btnClose')
    payload.pop('btnMapClose')
    res = s.post(url, data=payload, headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Referer': 'https://proximity.niceic.com/mainform.aspx?PostCode=YO95',
    })
    sauce = BeautifulSoup(res.text, "lxml")
    for elem in sauce.select("table#gvContractors tr:has([id*='_lblName'])"):
        name = elem.select_one("span[id*='_lblName']").get_text(strip=True)
        print(name)

Navigating to the next page is performed via a POST request that carries a __VIEWSTATE cursor.
Here is how you can do it with requests:
1. Make a GET request to the first page;
2. Parse the required data and the __VIEWSTATE cursor;
3. Prepare a POST request for the next page with the received cursor;
4. Run it, then parse all the data and the new cursor for the next page.
I won't provide any code, because it would amount to writing almost the whole crawler.
==== Added ====
You almost had it, but there are two important things you missed.
It is necessary to send headers with the first GET request. If no headers are sent, we get back broken tokens (easy to spot visually: they don't end with ==).
We need to add __ASYNCPOST to the payload we send. (Interestingly, it is not the boolean True but the string 'true'.)
Here's the code. I removed bs4 and switched to lxml (I don't like bs4, it is very slow). We know exactly which data we need to send, so let's parse only a few inputs.
import re
import requests
from lxml import etree


def get_nextpage_tokens(response_body):
    """ Parse tokens from the XMLHttpRequest response and build the payload for the next page """
    try:
        payload = dict()
        payload['ToolkitScriptManager1'] = 'UpdatePanel1|gvContractors'
        payload['__EVENTTARGET'] = 'gvContractors'
        payload['__EVENTARGUMENT'] = 'Page$Next'
        payload['__VIEWSTATEENCRYPTED'] = ''
        payload['__VIEWSTATE'] = re.search(r'__VIEWSTATE\|([^\|]+)', response_body).group(1)
        payload['__VIEWSTATEGENERATOR'] = re.search(r'__VIEWSTATEGENERATOR\|([^\|]+)', response_body).group(1)
        payload['__EVENTVALIDATION'] = re.search(r'__EVENTVALIDATION\|([^\|]+)', response_body).group(1)
        payload['__ASYNCPOST'] = 'true'
        return payload
    except AttributeError:
        # re.search returned None - no tokens found in the response
        return None


if __name__ == '__main__':
    url = "https://proximity.niceic.com/mainform.aspx?PostCode=YO95"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Referer': 'https://proximity.niceic.com/mainform.aspx?PostCode=YO95',
    }

    with requests.Session() as s:
        page_num = 1
        r = s.get(url, headers=headers)
        parser = etree.HTMLParser()
        tree = etree.fromstring(r.text, parser)

        # Creating payload for the first "next page" request
        payload = dict()
        payload['ToolkitScriptManager1'] = 'UpdatePanel1|gvContractors'
        payload['__EVENTTARGET'] = 'gvContractors'
        payload['__EVENTARGUMENT'] = 'Page$Next'
        payload['__VIEWSTATE'] = tree.xpath("//input[@name='__VIEWSTATE']/@value")[0]
        payload['__VIEWSTATEENCRYPTED'] = ''
        payload['__VIEWSTATEGENERATOR'] = tree.xpath("//input[@name='__VIEWSTATEGENERATOR']/@value")[0]
        payload['__EVENTVALIDATION'] = tree.xpath("//input[@name='__EVENTVALIDATION']/@value")[0]
        payload['__ASYNCPOST'] = 'true'

        headers['X-Requested-With'] = 'XMLHttpRequest'

        while True:
            page_num += 1
            res = s.post(url, data=payload, headers=headers)
            print(f'page {page_num} data: {res.text}')  # FIXME: Parse data

            payload = get_nextpage_tokens(res.text)  # Creating payload for next page
            if not payload:
                # Break if we got no tokens - maybe it was the last page (it must be checked)
                break
Important
The response is not well-formed HTML, so you will have to deal with that: cut out the updated table or something similar. Good luck!
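For example, here is a minimal sketch of that parsing step, which could replace the FIXME above. It reuses the '_lblName' span ids from the question's selectors, so treat the exact structure of the async fragment as an assumption to verify:

from lxml import etree

def parse_names(response_body):
    # The async response is pipe-delimited and not well-formed HTML; the lenient
    # HTMLParser still recovers the updated table markup well enough to query it.
    tree = etree.fromstring(response_body, etree.HTMLParser())
    return [name.strip() for name in
            tree.xpath("//table[@id='gvContractors']//span[contains(@id, '_lblName')]/text()")]

Inside the loop you would then call print(f'page {page_num}: {parse_names(res.text)}') instead of dumping the raw response.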

Related

requests-html not finding page element

So I'm trying to navigate to this url: https://www.instacart.com/store/wegmans/search_v3/horizon%201%25
and scrape data from the div with the class item-name item-row. There are two main problems though: the first is that instacart.com requires a login before you can get to that url, and the second is that most of the page is generated with javascript.
I believe I've solved the first problem because my session.post(...) gets a 200 response code. I'm also pretty sure that r.html.render() is supposed to solve the second problem by rendering the javascript-generated html before I scrape it. Unfortunately, the last line in my code only returns an empty list, despite the fact that selenium had no problem getting this element. Does anyone know why this isn't working?
from requests_html import HTMLSession
from bs4 import BeautifulSoup

headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}

session = HTMLSession()
res1 = session.get('http://www.instacart.com', headers=headers)
soup = BeautifulSoup(res1.content, 'html.parser')
token = soup.find('meta', {'name': 'csrf-token'}).get('content')
data = {"user": {"email": "alexanderjbusch@gmail.com", "password": "password"},
        "authenticity_token": token}

response = session.post('https://www.instacart.com/accounts/login', headers=headers, data=data)
print(response)

r = session.get("https://www.instacart.com/store/wegmans/search_v3/horizon%201%25", headers=headers)
r.html.render()
print(r.html.xpath("//div[@class='item-name item-row']"))
After logging in using the requests module and BeautifulSoup, you can make use of the link I've already suggested in the comment to parse the required data available within the json. The following script should get you the name, quantity, price and a link to each product. You can only get 21 products with the script below; there is an option for pagination within this json content, and you can get all of the products by playing around with that pagination (see the hedged sketch after the sample output).
import json
import requests
from bs4 import BeautifulSoup

baseurl = 'https://www.instacart.com/store/'
data_url = "https://www.instacart.com/v3/retailers/159/module_data/dynamic_item_lists/cart_starters/storefront_canonical?origin_source_type=store_root_department&tracking.page_view_id=b974d56d-eaa4-4ce2-9474-ada4723fc7dc&source=web&cache_key=df535d-6863-f-1cd&per=30"

data = {"user": {"email": "alexanderjbusch@gmail.com", "password": "password"},
        "authenticity_token": ""}

headers = {
    'user-agent': 'Mozilla/5.0',
    'x-requested-with': 'XMLHttpRequest'
}

with requests.Session() as s:
    res = s.get('https://www.instacart.com/', headers={'user-agent': 'Mozilla/5.0'})
    soup = BeautifulSoup(res.text, 'lxml')
    token = soup.select_one("[name='csrf-token']").get('content')
    data["authenticity_token"] = token
    s.post("https://www.instacart.com/accounts/login", json=data, headers=headers)
    resp = s.get(data_url, headers=headers)
    for item in resp.json()['module_data']['items']:
        name = item['name']
        quantity = item['size']
        price = item['pricing']['price']
        product_page = baseurl + item['click_action']['data']['container']['path']
        print(f'{name}\n{quantity}\n{price}\n{product_page}\n')
Partial output:
SB Whole Milk
1 gal
$3.90
https://www.instacart.com/store/items/item_147511418
Banana
At $0.69/lb
$0.26
https://www.instacart.com/store/items/item_147559922
Yellow Onion
At $1.14/lb
$0.82
https://www.instacart.com/store/items/item_147560764
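As for getting past those first 21 items: the pagination lives in the same json, but I haven't pinned down its exact field names, so the following is only a sketch under the assumption that the response carries a pagination object with a next_page cursor. Inspect resp.json() (or the browser's network tab) and adjust the keys before relying on it.

def fetch_all_items(session, first_url, headers):
    # Hypothetical pagination loop - the 'pagination' / 'next_page' keys below are
    # assumptions, not confirmed fields of the Instacart endpoint.
    url, items = first_url, []
    while url:
        payload = session.get(url, headers=headers).json()
        items.extend(payload['module_data']['items'])
        url = (payload.get('pagination') or {}).get('next_page')  # assumed cursor field
    return items

# Usage, reusing the session from the script above:
# all_items = fetch_all_items(s, data_url, headers)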

Python Screen Scraping Forbes.com

I'm writing a Python program to extract and store metadata from interesting online tech articles: "og:title", "og:description", "og:image", "og:url", and "og:site_name".
This is the code I'm using...
import urllib3
from bs4 import BeautifulSoup

# Setup Headers
headers = {}
headers['Accept'] = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
headers['Accept-Charset'] = 'ISO-8859-1,utf-8;q=0.7,*;q=0.3'
headers['Accept-Encoding'] = 'none'
headers['Accept-Language'] = "en-US,en;q=0.8"
headers['Connection'] = 'keep-alive'
headers['User-Agent'] = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36"

# Create the Request
http = urllib3.PoolManager()

# Create the Response
response = http.request('GET', url, headers=headers)

# BeautifulSoup - Construct
soup = BeautifulSoup(response.data, 'html.parser')

# Scrape <meta property="og:title" content=" x x x ">
title = ""
for tag in soup.find_all("meta"):
    if tag.get("property", None) == "og:title":
        if len(tag.get("content", None)) > len(title):
            title = tag.get("content", None)
The program runs fine on all but one site. On "forbes.com", I can't get to the articles using Python:
url=
https://www.forbes.com/consent/?toURL=https://www.forbes.com/sites/shermanlee/2018/07/31/privacy-revolution-how-blockchain-is-reshaping-our-economy/#72c3b4e21086
I can't bypass this consent page, which seems to be the "Cookie Consent Manager" solution from "TrustArc". In a browser, you basically provide your consent once, and on each subsequent visit you're able to access the articles.
If I reference the "toURL" url:
https://www.forbes.com/sites/shermanlee/2018/07/31/privacy-revolution-how-blockchain-is-reshaping-our-economy/#72c3b4e21086
and try to bypass the "https://www.forbes.com/consent/" page, I'm redirected back to it.
I've tried to see if there is a cookie I could set in the header, but couldn't find the magic key.
Can anyone help me?
There is a required cookie notice_gdpr_prefs that needs to be sent to view the data:
import requests
from bs4 import BeautifulSoup

src = requests.get(
    "https://www.forbes.com/sites/shermanlee/2018/07/31/privacy-revolution-how-blockchain-is-reshaping-our-economy/",
    headers={
        "cookie": "notice_gdpr_prefs"
    })
soup = BeautifulSoup(src.content, 'html.parser')
title = soup.find("meta", property="og:title")
print(title["content"])
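Since the question also wants og:description, og:image, og:url and og:site_name, the same pattern extends to all of them. A small sketch continuing from the soup above (the property names are the ones listed in the question; any tag missing on a given article simply yields None):

og_properties = ["og:title", "og:description", "og:image", "og:url", "og:site_name"]
metadata = {}
for prop in og_properties:
    tag = soup.find("meta", property=prop)
    metadata[prop] = tag["content"] if tag else None  # tolerate articles that omit a tag
print(metadata)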

Requests in python return error, while opening link manually works perfect

import requests
a = 'http://tmsearch.uspto.gov/bin/showfield?f=toc&state=4809%3Ak1aweo.1.1&p_search=searchstr&BackReference=&p_L=100&p_plural=no&p_s_PARA1={}&p_tagrepl%7E%3A=PARA1%24MI&expr=PARA1+or+PARA2&p_s_PARA2=&p_tagrepl%7E%3A=PARA2%24ALL&a_default=search&f=toc&state=4809%3Ak1aweo.1.1&a_search=Submit+Query'
a = a.format('coca-cola')
b = requests.get(a)
print(b.text)
print(b.url)
If you copy the printed url and paste it into a browser, the site opens with no problem, but if I do requests.get I get some token(?) errors. Is there anything I can do?
Via requests.get I get the url back, but not the data I see when opening it manually. It says: <html><head><TITLE>TESS -- Error</TITLE></head><body>
First of all, make sure you follow the website's Terms of Use and usage policies.
This is a little bit more complicated than it may seem. You need to maintain a certain state throughout the web-scraping session, and you'll need an HTML parser like BeautifulSoup along the way:
from urllib.parse import parse_qs, urljoin

import requests
from bs4 import BeautifulSoup

SEARCH_TERM = 'coca-cola'

with requests.Session() as session:
    session.headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'}

    # get the current search state
    response = session.get("https://tmsearch.uspto.gov/")
    soup = BeautifulSoup(response.content, "html.parser")
    link = soup.find("a", text="Basic Word Mark Search (New User)")["href"]
    session.get(urljoin(response.url, link))
    state = parse_qs(link)['state'][0]

    # perform a search
    response = session.post("https://tmsearch.uspto.gov/bin/showfield", data={
        'f': 'toc',
        'state': state,
        'p_search': 'search',
        'p_s_All': '',
        'p_s_ALL': SEARCH_TERM + '[COMB]',
        'a_default': 'search',
        'a_search': 'Submit'
    })

    # print search results
    soup = BeautifulSoup(response.content, "html.parser")
    print(soup.find("font", color="blue").get_text())

    table = soup.find("th", text="Serial Number").find_parent("table")
    for row in table('tr')[1:]:
        print(row('td')[1].get_text())
It prints all the serial number values from the first search results page, for demonstration purposes.
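If you want more than the second column, the header row of the same table can be used to label every cell, so you don't have to hard-code column indexes. A sketch continuing from the table variable above; the exact header captions (e.g. "Word Mark") are assumptions, so print columns once to see what the page actually returns:

# Label every cell by the column headers of the results table
columns = [th.get_text(strip=True) for th in table('tr')[0]('th')]
for row in table('tr')[1:]:
    cells = [td.get_text(strip=True) for td in row('td')]
    record = dict(zip(columns, cells))
    print(record.get("Serial Number"), record.get("Word Mark"))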

How can I parse long web pages with beautiful soup?

I have been using the following code to parse the web page at https://www.blogforacure.com/members.php. The code is expected to return the links for all of the members on the given page.
from bs4 import BeautifulSoup
import urllib

r = urllib.urlopen('https://www.blogforacure.com/members.php').read()
soup = BeautifulSoup(r, 'lxml')
headers = soup.find_all('h3')
print(len(headers))
for header in headers:
    a = header.find('a')
    print(a.attrs['href'])
But I get only the first 10 links from the above page. Even when printing the prettified output I see only the first 10 links.
The results are dynamically loaded by making AJAX requests to the https://www.blogforacure.com/site/ajax/scrollergetentries.php endpoint.
Simulate them in your code with requests maintaining a web-scraping session:
from bs4 import BeautifulSoup
import requests

url = "https://www.blogforacure.com/site/ajax/scrollergetentries.php"

with requests.Session() as session:
    session.headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'}
    session.get("https://www.blogforacure.com/members.php")

    page = 0
    members = []
    while True:
        # get page
        response = session.post(url, data={
            "p": str(page),
            "id": "#scrollbox1"
        })
        html = response.json()['html']

        # parse html
        soup = BeautifulSoup(html, "html.parser")
        page_members = [member.get_text() for member in soup.select(".memberentry h3 a")]
        print(page, page_members)
        members.extend(page_members)

        page += 1
It prints the current page number and the list of members per page, accumulating member names into a members list. I'm not posting what it prints since it contains names.
Note that I've intentionally left the loop endless; please figure out the exit condition yourself, perhaps when response.json() throws an error.
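One possible shape for that exit condition, as a sketch that drops into the answer's code in place of the while loop above (it reuses session, url, page and members from there). I'm assuming the endpoint either stops returning valid JSON or starts returning an empty batch once you run past the last page; verify which of the two actually happens.

while True:
    response = session.post(url, data={"p": str(page), "id": "#scrollbox1"})
    try:
        html = response.json()['html']
    except (ValueError, KeyError):
        break  # no JSON (or no 'html' key) any more - assume we ran past the last page

    soup = BeautifulSoup(html, "html.parser")
    page_members = [member.get_text() for member in soup.select(".memberentry h3 a")]
    if not page_members:
        break  # an empty batch is the other likely end-of-data signal

    members.extend(page_members)
    page += 1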

POST URL Encoded vs Line-based text data via Python Requests

I'm trying to scrape some data from a website and I can't get the POST to work; it acts as though I didn't give it the input data ("appnote").
When I examine the POST data it looks roughly the same, except that the actual webform's POST is labeled "URL Encoded" and lists each form input, whereas mine is labeled "Line-based text data".
Here's my code; (appnote) and Search (Search) are the most relevant pieces I need:
import requests
import cookielib

jar = cookielib.CookieJar()
url = 'http://www.vivotek.com/faq/'
headers = {'content-type': 'application/x-www-form-urlencoded'}
post_data = {#'__EVENTTARGET':'',
             #'__EVENTARGUMENT':'',
             '__LASTFOCUS': '',
             '__VIEWSTATE': '',
             '__VIEWSTATEGENERATOR': '',
             '__VIEWSTATEENCRYPTED': '',
             '__PREVIOUSPAGE': '',
             '__EVENTVALIDATION': '',
             'ctl00$HeaderUc1$LanguageDDLUc1$ddlLanguage': 'en',
             'ctl00$ContentPlaceHolder1$CategoryDDLUc1$DropDownList1': '-1',
             'ctl00$ContentPlaceHolder1$ProductDDLUc1$DropDownList1': '-1',
             'ctl00$ContentPlaceHolder1$Content': 'appnote',
             'ctl00$ContentPlaceHolder1$Search': 'Search'
             }
response = requests.get(url, cookies=jar)
response = requests.post(url, cookies=jar, data=post_data, headers=headers)
print(response.text)
Links to images of what I'm talking about in Wireshark:
Wireshark Form
Wireshark Line
I also tried it using wget with the same results.
The main problem is that you are not setting the important hidden field values, like __VIEWSTATE.
For this to work using requests, you need to parse the page html and get the appropriate input values.
Here's a solution using the BeautifulSoup HTML parser and requests:
from bs4 import BeautifulSoup
import requests

url = 'http://www.vivotek.com/faq/'
query = 'appnote'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.124 Safari/537.36'}

session = requests.Session()
response = session.get(url, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')

post_data = {'__EVENTTARGET': '',
             '__EVENTARGUMENT': '',
             '__LASTFOCUS': '',
             '__VIEWSTATE': soup.find('input', id='__VIEWSTATE')['value'],
             '__VIEWSTATEGENERATOR': soup.find('input', id='__VIEWSTATEGENERATOR')['value'],
             '__VIEWSTATEENCRYPTED': '',
             '__PREVIOUSPAGE': soup.find('input', id='__PREVIOUSPAGE')['value'],
             '__EVENTVALIDATION': soup.find('input', id='__EVENTVALIDATION')['value'],
             'ctl00$HeaderUc1$LanguageDDLUc1$ddlLanguage': 'en',
             'ctl00$ContentPlaceHolder1$CategoryDDLUc1$DropDownList1': '-1',
             'ctl00$ContentPlaceHolder1$ProductDDLUc1$DropDownList1': '-1',
             'ctl00$ContentPlaceHolder1$Content': query,
             'ctl00$ContentPlaceHolder1$Search': 'Search'
             }

response = session.post(url, data=post_data, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')

for item in soup.select('a#ArticleShowLink'):
    print(item.text.strip())
Prints the specific results for the appnote query:
How to troubleshoot when you can't watch video streaming?
Recording performance benchmarking tool
...
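A slightly more general variant of the same idea, in case you'd rather not list every hidden field by hand: harvest all of the form's named inputs in one pass, then override just the fields you care about. This is a sketch of a helper, not something the site documents, and as the NICEIC question at the top of this page showed, you may need to pop button fields (e.g. close buttons) that the server does not expect to receive.

from bs4 import BeautifulSoup

def asp_net_payload(html, overrides):
    # Collect every named <input> value from the ASP.NET form (covers __VIEWSTATE,
    # __EVENTVALIDATION, etc.), then merge in the fields we actually want to set.
    soup = BeautifulSoup(html, 'html.parser')
    payload = {tag['name']: tag.get('value', '')
               for tag in soup.find_all('input', attrs={'name': True})}
    payload.update(overrides)
    return payload

# Usage with the same form as above (field names taken from the answer's code):
# post_data = asp_net_payload(response.content, {
#     'ctl00$ContentPlaceHolder1$Content': query,
#     'ctl00$ContentPlaceHolder1$Search': 'Search',
# })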
