I have tried to scrape data from http://www.educationboardresults.gov.bd/ with Python and BeautifulSoup.
First, the website requires filling in a form; after the form is submitted, the website shows the results. I have attached two images here.
Before Submitting Form: https://prnt.sc/w4lo7i
After Submission: https://prnt.sc/w4lqd0
I have tried the following code:
import requests
from bs4 import BeautifulSoup as bs

resultdata = {
    'sr': '3',
    'et': '2',
    'exam': 'ssc',
    'year': 2012,
    'board': 'chittagong',
    'roll': 102275,
    'reg': 626948,
    'button2': 'Submit',
}

headers = {
    'user-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36',
    'cookie': 'PHPSESSID=24vp2g7ll9utu1p2ob5bniq263; tcount_unique_eb_log=1',
    'Origin': 'http://www.educationboardresults.gov.bd',
    'Referer': 'http://www.educationboardresults.gov.bd/',
    'Request URL': 'http://www.educationboardresults.gov.bd/result.php'
}

with requests.Session() as s:
    url = 'http://www.educationboardresults.gov.bd'
    r = s.get(url, headers=headers)
    soup = bs(r.content, 'html5lib')

    # Scraping and bypassing the captcha
    alltable = soup.findAll('td')
    captcha = alltable[56].text.split('+')
    for digit in captcha:
        value_one, value_two = int(captcha[0]), int(captcha[1])
    resultdata['value_s'] = value_one + value_two
    r = s.post(url, data=resultdata, headers=headers)
When I print r.content, it shows the first page's code. I want to scrape the second page.
Thanks in advance.
You are making POST requests to the wrong URL. Moreover, you are supposed to add the two captcha numbers together and send the sum as value_s. If you are using bs4 version 4.7.0 or later, the following selector will work for you, as I've used a CSS pseudo-selector. The bottom line is that your issue is solved. Try the following:
import requests
from bs4 import BeautifulSoup

link = 'http://www.educationboardresults.gov.bd/'
result_url = 'http://www.educationboardresults.gov.bd/result.php'

resultdata = {
    'sr': '3',
    'et': '2',
    'exam': 'ssc',
    'year': 2012,
    'board': 'chittagong',
    'roll': 102275,
    'reg': 626948,
    'button2': 'Submit',
}

def get_number(s, link):
    r = s.get(link)
    soup = BeautifulSoup(r.text, "html5lib")
    num = 0
    captcha_numbers = soup.select_one("tr:has(> td > #value_s) > td + td").text.split("+")
    for i in captcha_numbers:
        num += int(i)
    return num

with requests.Session() as s:
    s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'
    resultdata['value_s'] = get_number(s, link)
    r = s.post(result_url, data=resultdata)
    print(r.text)
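If the POST succeeds, r.text now holds the marksheet page rather than the form. A hedged follow-up sketch (assuming the result is rendered in plain HTML tables, as the screenshots suggest; the exact layout is not verified here):

result_soup = BeautifulSoup(r.text, "html5lib")
# Dump every table row so you can spot which cells hold the grades.
for row in result_soup.select("table tr"):
    cells = [td.get_text(strip=True) for td in row.select("td")]
    if cells:
        print(cells)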
I am also trying.
import requests
from bs4 import BeautifulSoup as bs

resultdata = {
    'sr': '3',
    'et': '2',
    'exam': 'ssc',
    'year': "2012",
    'board': 'chittagong',
    'roll': "102275",
    'reg': "626948",
}

headers = {
    'user-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36',
    'cookie': 'PHPSESSID=24vp2g7ll9utu1p2ob5bniq263; tcount_unique_eb_log=1',
    'Origin': 'http://www.educationboardresults.gov.bd',
    'Referer': 'http://www.educationboardresults.gov.bd/',
    'Request URL': 'http://www.educationboardresults.gov.bd/result.php'
}

with requests.Session() as s:
    url = 'http://www.educationboardresults.gov.bd/index.php'
    r = s.get(url, headers=headers)
    soup = bs(r.content, 'lxml')
    # print(soup.prettify())

    # Scraping and bypassing the captcha
    alltable = soup.findAll('td')
    captcha = alltable[56].text.split('+')
    print(captcha)
    value_one, value_two = int(captcha[0]), int(captcha[1])
    print(value_one, value_two)
    resultdata['value_s'] = value_one + value_two
    resultdata['button2'] = 'Submit'
    print(resultdata)

    # Post the filled form to result.php, not to the index page.
    r = s.post("http://www.educationboardresults.gov.bd/result.php", data=resultdata, headers=headers)
    soup = bs(r.content, 'lxml')
    print(soup.prettify())
For a small project I tried to get past the Google consent page in order to scrape the prices it finds. However, I could not get to the good stuff. So far I have tried the following code, as also proposed by "Using python requests with google search".
First try:
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs

# %% get access
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
    'cookie': 'CONSENT=YES'
}

cookie = {
    'set-cookie': 'CONSENT=YES+cb.20211212-16-p1.nl+FX+671'
}

url = 'https://www.google.com/search?q=kitesurf+bar&rlz=1C1CAFA_enNL603NL674&sxsrf=AOaemvJF3BPrjeczkCI1e1YotCSJYcKjug:1641909060258&source=lnms&tbm=shop&sa=X&ved=2ahUKEwiBwqjy66n1AhXO-KQKHUKVBSoQ_AUoAXoECAEQAw'

s = requests.Session()
s.get(url, headers=headers)

purl = 'https://consent.google.com/s'
payload = {
    'gl': 'NL',
    'm': 'false',
    'pc': 'srp',
    'continue': 'https://www.google.com/search?q=kitesurf+bar&rlz=1C1CAFA_enNL603NL674&sxsrf=AOaemvJF3BPrjeczkCI1e1YotCSJYcKjug:1641909060258&source=lnms&tbm=shop&sa=X&ved=2ahUKEwiBwqjy66n1AhXO-KQKHUKVBSoQ_AUoAXoECAEQAw',
    'ca': 'r',
    'x': '6',
    'v': 'cb.20211212-16-p1.nl+FX+092',
    't': 'ADw3F8jHa3HqOqq133-wOkXCYf4K_r-AIA:1641909071268',
    'hl': 'nl',
    'src': '1'
}

s.post(purl, params=payload, headers=headers)
page = s.get(url, headers=headers, cookies=cookie)
Second try:
import requests
from bs4 import BeautifulSoup

with requests.Session() as s:
    url = "https://www.google.com/search?q=fitness+wear"
    headers = {
        "referer": "https://www.google.com/",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36"
    }
    s.post(url, headers=headers)
    response = s.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    print(soup)
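For reference, a minimal sketch of the approach both attempts are circling around: pre-setting the CONSENT cookie on the session for the .google.com domain before requesting the results page. This is an assumption rather than a verified fix; Google changes its consent mechanism regularly, and the cookie value below is simply reused from the first try and may be stale.

import requests
from bs4 import BeautifulSoup

search_url = 'https://www.google.com/search?q=kitesurf+bar&tbm=shop'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'}

with requests.Session() as s:
    s.headers.update(headers)
    # Assumed cookie value, copied from the first try above; it may be stale.
    s.cookies.set('CONSENT', 'YES+cb.20211212-16-p1.nl+FX+671', domain='.google.com')
    response = s.get(search_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    print(soup.title)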
I want to get to the last element of a dictionary (pasted below); it's inside another key, "offers", and I have no clue how to extract it.
html = s.get(url=url, headers=headers, verify=False, timeout=15)
soup = BeautifulSoup(html.text, 'html.parser')
products = soup.find_all('script', {'type': "application/ld+json"})
{"#context":"http://schema.org","#type":"Product","aggregateRating":{"#type":"AggregateRating","bestRating":5,"ratingValue":"4.8","ratingCount":11,"worstRating":3,"reviewCount":5},"brand":{"#type":"Brand","name":"New Balance"},"color":"white/red/biały","image":["https://img01.ztat.net/3"],"itemCondition":"http://schema.org/NewCondition","manufacturer":"New Balance","name":"550 UNISEX - Sneakersy niskie - white/red","offers":[{"#type":"Offer","availability":"http://schema.org/OutOfStock","price":"489","priceCurrency":"PLN","sku":"NE215O06U-A110001000","url":"/new-balance-550-unisex-sneakersy-niskie-whitered-ne215o06u-a11.html"},{"#type":"Offer","availability":"http://schema.org/OutOfStock","price":"489","priceCurrency":"PLN","sku":"NE215O06U-A110002000","url":"/new-balance-550-unisex-sneakersy-niskie-whitered-ne215o06u-a11.html"} (...)
As mentioned, extract the contents via BeautifulSoup, then decode the string with json.loads():
import json
products = '{"#context":"http://schema.org","#type":"Product","aggregateRating":{"#type":"AggregateRating","bestRating":5,"ratingValue":"4.8","ratingCount":11,"worstRating":3,"reviewCount":5},"brand":{"#type":"Brand","name":"New Balance"},"color":"white/red/biały","image":["https://img01.ztat.net/3"],"itemCondition":"http://schema.org/NewCondition","manufacturer":"New Balance","name":"550 UNISEX - Sneakersy niskie - white/red","offers":[{"#type":"Offer","availability":"http://schema.org/OutOfStock","price":"489","priceCurrency":"PLN","sku":"NE215O06U-A110001000","url":"/new-balance-550-unisex-sneakersy-niskie-whitered-ne215o06u-a11.html"},{"#type":"Offer","availability":"http://schema.org/OutOfStock","price":"489","priceCurrency":"PLN","sku":"NE215O06U-A110002000","url":"/new-balance-550-unisex-sneakersy-niskie-whitered-ne215o06u-a11.html"}]}'
products = json.loads(products)
To get the last element (dict) in offers:
products['offers'][-1]
Output:
{'@type': 'Offer',
'availability': 'http://schema.org/OutOfStock',
'price': '489',
'priceCurrency': 'PLN',
'sku': 'NE215O06U-A110002000',
'url': '/new-balance-550-unisex-sneakersy-niskie-whitered-ne215o06u-a11.html'}
Example
In your special case you also have to replace('&quot;', '"') first:
from bs4 import BeautifulSoup
import requests, json
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36",
"X-Amzn-Trace-Id": "Root=1-61acac03-6279b8a6274777eb44d81aae",
"X-Client-Data": "CJW2yQEIpLbJAQjEtskBCKmdygEIuevKAQjr8ssBCOaEzAEItoXMAQjLicwBCKyOzAEI3I7MARiOnssB" }
html = requests.get('https://www.zalando.de/new-balance-550-unisex-sneaker-low-whitered-ne215o06u-a11.html', headers=headers)
soup = BeautifulSoup(html.content, 'lxml')
jsonData = json.loads(soup.select_one('script[type="application/ld+json"]').text.replace('&quot;', '"'))
jsonData['offers'][-1]
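From there it is ordinary dict and list access; for example, to print every SKU with its price (key names taken from the JSON shown above):

for offer in jsonData['offers']:
    print(offer['sku'], offer['price'], offer['priceCurrency'])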
I'm trying to scrape all the follower names from a profile page using the requests module. The problem is that when I run the script below, I get the first 20 names over and over again.
The payload used in the POST request has only two keys and values, like size: 20 and continuation: timestamp. I tried to use these parameters the right way, but I still get the same results repeatedly.
import time
import requests

link = 'https://api-mainnet.rarible.com/marketplace/api/v4/followers'
params = {'user': '0xe744d23107c9c98df5311ff8c1c8637ec3ecf9f3'}
payload = {"size": 20}

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36',
    'origin': 'https://rarible.com',
    'referer': 'https://rarible.com/'
}

with requests.Session() as s:
    s.headers.update(headers)
    while True:
        res = s.post(link, params=params, json=payload)
        print(s.headers)
        for item in res.json():
            print(item['owner'].get('name', ''))
        payload['continuation'] = f"{int(time.time() * 1000)}"
        time.sleep(2)
How can I parse all the follower names from that page using requests?
Your next continuation value is in the X-CONTINUATION response header, so the following will work when increasing the size in the payload doesn't:
import requests

link = 'https://api-mainnet.rarible.com/marketplace/api/v4/followers'
params = {'user': '0xe744d23107c9c98df5311ff8c1c8637ec3ecf9f3'}
payload = {"size": 20}

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36',
    'origin': 'https://rarible.com',
    'referer': 'https://rarible.com/'
}

res = requests.post(link, headers=headers, params=params, json=payload)
print(res.headers["X-CONTINUATION"])

while True:
    for item in res.json():
        print(item['owner'].get('name', ''))
    # Stop once the server no longer returns a continuation token.
    if not res.headers["X-CONTINUATION"]:
        break
    payload['continuation'] = res.headers["X-CONTINUATION"]
    res = requests.post(link, headers=headers, params=params, json=payload)
Some APIs may block you from extracting more items than a certain limit and may also page the results. For me, just increasing the size in the payload worked with your code.
import time
import requests

link = 'https://api-mainnet.rarible.com/marketplace/api/v4/followers'
params = {'user': '0xe744d23107c9c98df5311ff8c1c8637ec3ecf9f3'}
payload = {"size": 10000}

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36',
    'origin': 'https://rarible.com',
    'referer': 'https://rarible.com/'
}

with requests.Session() as s:
    s.headers.update(headers)
    res = s.post(link, params=params, json=payload)
    print(len(res.json()))
    for item in res.json():
        print(item['owner'].get('name', ''))
I am trying to scrape data from https://www.seethroughny.net/payrolls/110681345 but the table is difficult to deal with.
I have tried many things.
import pandas as pd
import ssl
import csv
ssl._create_default_https_context = ssl._create_unverified_context
calls_df = pd.read_html("https://www.seethroughny.net/payrolls/110681345", header=0)
print(calls_df)
calls_df.to_csv("calls.csv", index=False)
I would like to parse this into a CSV file, as I am index-matching it with another dataset.
There is a JSON response containing the HTML. It seems that something blocks requests at random points in the all-results loop version at the end.
Single-page version, where you change the current_page value to the appropriate page number:
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs

url = 'https://www.seethroughny.net/tools/required/reports/payroll?action=get'

headers = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'User-Agent': 'Mozilla/5.0',
    'Referer': 'https://www.seethroughny.net/payrolls/110681'
}

data = {
    'PayYear[]': '2018',
    'BranchName[]': 'Villages',
    'SortBy': 'YTDPay DESC',
    'current_page': '0',
    'result_id': '110687408',
    'url': '/tools/required/reports/payroll?action=get',
    'nav_request': '0'
}

r = requests.post(url, headers=headers, data=data).json()
soup = bs(r['html'], 'lxml')

results = []
for item in soup.select('tr:nth-child(odd)'):
    row = [subItem.text for subItem in item.select('td')][1:]
    results.append(row)

df = pd.DataFrame(results)
df.to_csv(r'C:\Users\User\Desktop\Data.csv', sep=',', encoding='utf-8-sig', index=False)
All-pages version (work in progress, as currently the request can fail to return JSON at varying points in the loop despite the delay). It seems improved with @Sim's suggestion of swapping out user agents.
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs
import time
from requests.packages.urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
import random

ua = ['Mozilla/5.0',
      'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
      'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
      'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.71 Safari/537.36',
      'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
      ]

url = 'https://www.seethroughny.net/tools/required/reports/payroll?action=get'

headers = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'User-Agent': 'Mozilla/5.0',
    'Referer': 'https://www.seethroughny.net/payrolls/110681'
}

data = {
    'PayYear[]': '2018',
    'BranchName[]': 'Villages',
    'SortBy': 'YTDPay DESC',
    'current_page': '0',
    'result_id': '110687408',
    'url': '/tools/required/reports/payroll?action=get',
    'nav_request': '0'
}

results = []
i = 0

with requests.Session() as s:
    retries = Retry(total=5,
                    backoff_factor=0.1,
                    status_forcelist=[500, 502, 503, 504])
    s.mount('http://', HTTPAdapter(max_retries=retries))

    while len(results) < 1000:  # total
        data['current_page'] = i
        data['result_id'] = str(int(data['result_id']) + i)
        try:
            r = s.post(url, headers=headers, data=data).json()
        except Exception as e:
            print(e)
            time.sleep(2)
            # Rotate the user agent and retry once before moving on.
            headers['User-Agent'] = random.choice(ua)
            r = s.post(url, headers=headers, data=data).json()
            continue
        soup = bs(r['html'], 'lxml')
        for item in soup.select('tr:nth-child(odd)'):
            row = [subItem.text for subItem in item.select('td')][1:]
            results.append(row)
        i += 1
@Sim's version:
import requests
import pandas as pd
import time
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

url = 'https://www.seethroughny.net/tools/required/reports/payroll?action=get'

headers = {
    'User-Agent': 'Mozilla/5.0',
    'Referer': 'https://www.seethroughny.net/payrolls/110681'
}

data = {
    'PayYear[]': '2018',
    'BranchName[]': 'Villages',
    'SortBy': 'YTDPay DESC',
    'current_page': '0',
    'result_id': '110687408',
    'url': '/tools/required/reports/payroll?action=get',
    'nav_request': '0'
}

results = []
i = 0

def get_content(i):
    while len(results) < 15908:
        print(len(results))
        data['current_page'] = i
        # Pick a fresh random user agent for every request.
        headers['User-Agent'] = ua.random
        try:
            r = requests.post(url, headers=headers, data=data).json()
        except Exception:
            time.sleep(1)
            get_content(i)
        soup = BeautifulSoup(r['html'], 'lxml')
        for item in soup.select('tr:nth-child(odd)'):
            row = [subItem.text for subItem in item.select('td')][1:]
            results.append(row)
        i += 1

if __name__ == '__main__':
    ua = UserAgent()
    get_content(i)
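Whichever loop version you use, results only lives in memory; mirroring the single-page version above, it can be written out afterwards:

df = pd.DataFrame(results)
df.to_csv(r'C:\Users\User\Desktop\Data.csv', sep=',', encoding='utf-8-sig', index=False)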
Please help me handle authorization in this script. The problem is that I cannot automatically insert the invbf_session_id value into the GET request.
import pprint
import requests
import re
import shelve
import bs4

def scanning_posts():
    print('------------------------enter begin--------------------')
    url = 'http://forum.saransk.ru/'
    html = requests.get(url)
    pprint.pprint(html.headers)

    rawCookie = html.headers['Set-Cookie']
    cookie = re.search(r"invbf_session_id=(.*?);", rawCookie).group(1)
    pprint.pprint(cookie)  # cookie != zzzzzzzzzzzzzzzzzzzzzz

    html = requests.get(url)
    soup = bs4.BeautifulSoup(html.text)
    loginForm = soup.find('form', {'id': 'login'})
    hiddenAuthKey = soup.find('input', {'name': 'auth_key'})['value']

    authData = {
        'ips_username': 'xxxxxx',
        'ips_password': 'yyyyyy',
        'auth_key': hiddenAuthKey,
        'rememberMe': 1,
        'referer': 'http://forum.saransk.ru/'
    }

    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.107 Safari/537.36',
        'Referer': 'http://forum.saransk.ru/forum/'
    }

    # pprint.pprint(authData)
    print('\tlogin: ', authData['ips_username'])

    cookie = dict(invbf_session_id='zzzzzzzzzzzzzzzzzzzzzz')
    req = requests.get(url, params=authData, cookies=cookie, headers=header)

    soup = bs4.BeautifulSoup(req.text)
    signLinkNotLogged = soup.find('a', {'id': 'sign_in'})
    if signLinkNotLogged:
        print('------------------------enter failed--------------------')
    else:
        print('------------------------enter successful--------------------')

scanning_posts()
After running, the script gets a different invbf_session_id value from the one shown in FF Firebug, so authorization fails.
If I copy the invbf_session_id value from FF Firebug and paste it into the script, authorization succeeds.
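For reference, a hedged sketch of the usual fix: each bare requests.get() starts a brand-new session, so the invbf_session_id the script extracts is never the one sent with the login request. A requests.Session carries Set-Cookie values forward automatically, so there is nothing to copy by hand. The login endpoint below is a guess (IPB forums commonly post the form to a do=process URL); check the form's action attribute in the page source.

import requests
import bs4

url = 'http://forum.saransk.ru/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.107 Safari/537.36',
    'Referer': 'http://forum.saransk.ru/forum/'
}

with requests.Session() as s:
    s.headers.update(headers)
    html = s.get(url)  # the Set-Cookie with invbf_session_id is now stored on the session
    soup = bs4.BeautifulSoup(html.text, 'html.parser')
    auth_key = soup.find('input', {'name': 'auth_key'})['value']
    authData = {
        'ips_username': 'xxxxxx',
        'ips_password': 'yyyyyy',
        'auth_key': auth_key,
        'rememberMe': 1,
        'referer': url,
    }
    # Hypothetical login URL; replace it with whatever the login form actually posts to.
    login_url = url + 'index.php?app=core&module=global&section=login&do=process'
    req = s.post(login_url, data=authData)  # the same invbf_session_id is sent automatically
    soup = bs4.BeautifulSoup(req.text, 'html.parser')
    print('enter failed' if soup.find('a', {'id': 'sign_in'}) else 'enter successful')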