Web scraping of website which is reloading on option selection - python

So I am trying to scrape the CPI report from an Indian government website.
here is website https://fcainfoweb.nic.in/pmsver2/reports/report_menu_web.aspx ,
I am using this approach,
When we load this website it asks for multiple options to select. after selecting options and then hitting the get data button, we are redirected to report page.
Here, I copied my cookie and session details, which I used in the Python script below to retrieve the information; this is working fine.
Now, i want to fully automate this task, which will require
Price report -> Daily prices
date selection
getting data in code ,
but the issue is, the web pages are redirected and even the options on the selectors are changing — how do I scrape this?
I have the script below where I've given the prefetched cookie & session as parameters and am able to get data.
import requests
#from fake_useragent import UserAgent
from bs4 import BeautifulSoup
import lxml.html as lh
import pandas as pd
from pprint import pprint
# https://fcainfoweb.nic.in/reports/Report_Menu_Web.aspx
# report link = https://fcainfoweb.nic.in/reports/Report_daily1_Web_New.aspx
#url = 'https://fcainfoweb.nic.in/reports/Report_daily1_Web_New.aspx'
#url = 'https://fcainfoweb.nic.in/reports/Reportdaily9.aspx'
# "Cookie": "ASP.NET_SessionId=n3npgkgb2wpy3sup45ze024y; BNI_persistence=XIlVKPHMyFvRq0HtLj7pmqXxmRx7y7byO_ia3T0PrBLraaAiDz2RxPPPWpXCo2y2SGMfsbBJx4Pe4wWpm_C-OA=="}
#headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
#ua = UserAgent()9
#"Cookie":"ASP.NET_SessionId=dkk2h2003kzfamcypczrfaru; BNI_persistence=XIlVKPHMyFvRq0HtLj7pmqXxmRx7y7byO_ia3T0PrBLraaAiDz2RxPPPWpXCo2y2SGMfsbBJx4Pe4wWpm_C-OA==; _ga=GA1.3.654717034.1651138144; _gid=GA1.3.1558736990.1651468427; _gat_gtag_UA_106490103_3=1"
#res = requests.get('https://fcainfoweb.nic.in/reports/Daily_Average_Report_Data_Commoditywise_Percentage_Variation.aspx',headers=head)
# Fetch the report menu page with a pre-captured ASP.NET session cookie, save
# the raw HTML, and export the second HTML table to an Excel file.
# NOTE(review): the session cookie below was copied by hand and will expire;
# full automation needs to replay the ASP.NET postback flow (__VIEWSTATE /
# __EVENTVALIDATION) or drive the site with a browser automation tool.
head = {
    'User-Agent': 'Mozilla/5.0 (X11; OpenBSD i386) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36',
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Cookie": "ASP.NET_SessionId=n3npgkgb2wpy3sup45ze024y; BNI_persistence=XIlVKPHMyFvRq0HtLj7pmqXxmRx7y7byO_ia3T0PrBLraaAiDz2RxPPPWpXCo2y2SGMfsbBJx4Pe4wWpm_C-OA=="}
u = 'https://fcainfoweb.nic.in/Reports/Report_Menu_Web.aspx'
res = requests.get(u, headers=head)
res.raise_for_status()  # fail fast on HTTP errors instead of parsing an error page
print(res.headers)
print(res.text)
print(res.cookies)
# Save the raw response for offline inspection, with an explicit encoding.
with open('resp.html', 'w', encoding=res.encoding or 'utf-8') as f:
    f.write(res.text)  # bug fix: the write call was not indented under `with`
soup = BeautifulSoup(res.text, 'lxml')
tab = soup.find_all('table')
# pandas.read_html can find fewer than two tables on the menu page; guard the
# index instead of crashing with IndexError.
frames = pd.read_html(res.text)
htab = frames[1] if len(frames) > 1 else frames[0]
cnt = 1
fn = "data_{0}.xlsx".format(cnt)
htab.to_excel(fn)

Related

Moving to the next page using BeautifulSoup for scraping

I would need to scrape the content (just titles) from a website. I did it for one page, but I would need to do it for all the pages on the website.
Currently, I am doing as follows:
# Scrape the titles (<h2> elements) from a single listing page.
import bs4, requests
import pandas as pd
import re
# Desktop Chrome user-agent so the site serves the normal HTML page.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
# NOTE(review): `website` is not defined in this snippet — it must hold the
# listing URL before this line runs, otherwise a NameError is raised.
r = requests.get(website, headers=headers)
soup = bs4.BeautifulSoup(r.text, 'html')  # 'html' feature name; bs4 picks a matching parser
title=soup.find_all('h2')  # all <h2> tags — presumably the article titles on this site
I know that, when I move to the next page, the url changes as follows:
website/page/2/
website/page/3/
...
website/page/49/
...
I tried to build a recursive function using next_page_url = base_url + next_page_partial but it does not move to the next page.
# Pagination fragment (incomplete as pasted): meant to run inside the scraping
# loop. NOTE(review): `break` is only legal inside a loop, and the body
# indentation was lost in the paste, so this block does not run as-is.
if soup.find("span", text=re.compile("Next")):
# NOTE(review): .format(page_num) is a no-op here — the URL string has no {}
# placeholder, so `page` never actually contains the page number.
page = "https://catania.liveuniversity.it/notizie-catania-cronaca/cronacacatenesesicilina/page/".format(page_num)
page_num +=10 # I should scrape all the pages so maybe this number should be changed as I do not know at the beginning how many pages there are for that section
print(page_num)
else:
break
I followed this question (and answer): Moving to next page for scraping using BeautifulSoup
Please let me know if you need more info. Many thanks
Updated code:
# "Updated code" from the question — still buggy; see NOTE(review) comments.
import bs4, requests
import pandas as pd
import re
# Desktop Chrome user-agent so the site serves the normal HTML page.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
page_num=1
website="https://catania.liveuniversity.it/notizie-catania-cronaca/cronacacatenesesicilina"
# NOTE(review): loop-body indentation was lost in the paste.
while True:
r = requests.get(website, headers=headers)
soup = bs4.BeautifulSoup(r.text, 'html')
title=soup.find_all('h2')  # overwritten on every iteration — titles are never accumulated
if soup.find("span", text=re.compile("Next")):
# NOTE(review): `page` is computed but never assigned back to `website`, so
# the loop refetches the same page forever; .format() is also redundant on
# an f-string (the value is already interpolated).
page = f"https://catania.liveuniversity.it/notizie-catania-cronaca/cronacacatenesesicilina/page/{page_num}".format(page_num)
page_num +=10
else:
break
If you use f"url/{page_num}" then remove format(page_num).
You can use anything you want below:
page = f"https://catania.liveuniversity.it/notizie-catania-cronaca/cronacacatenesesicilina/page/{page_num}"
or
page = "https://catania.liveuniversity.it/notizie-catania-cronaca/cronacacatenesesicilina/page/{}".format(page_num)
Good luck!
Final answer will be this:
# Walk every listing page, following the "Next" link, and collect all <h2>
# titles across pages.
import bs4, requests
import pandas as pd
import re

# Desktop Chrome user-agent so the site serves the normal HTML page.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
base = "https://catania.liveuniversity.it/notizie-catania-cronaca/cronacacatenesesicilina"
website = base
page_num = 2   # bug fix: the first "Next" link leads to /page/2, not /page/1
titles = []    # bug fix: accumulate titles instead of overwriting them each page
while True:
    r = requests.get(website, headers=headers)
    r.raise_for_status()  # stop cleanly on an HTTP error instead of parsing it
    soup = bs4.BeautifulSoup(r.text, 'html.parser')  # explicit parser for reproducibility
    titles.extend(h2.get_text(strip=True) for h2 in soup.find_all('h2'))
    # Keep paging only while the page advertises a "Next" link.
    if soup.find("span", text=re.compile("Next")):
        website = f"{base}/page/{page_num}"
        page_num += 1
    else:
        break

Python 3 urllib library not returning the same HTML as inspected on Chrome

So I'm trying to extract the current EUR/USD price from a website using Python urllib but the website does not send the same HTML it sends to Chrome. The first part of the HTML is the same as on Chrome but it does not want to give me the EUR/USD value. Can I somehow bypass this?
Here's the code:
# Poll the currencies page and print the EUR/USD buy/sell prices.
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
import time

while True:
    req = Request('https://www.strategystocks.co.uk/currencies-market.html', headers={"User-Agent":'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'})
    html = urlopen(req).read()
    soup = BeautifulSoup(html, "html.parser")
    buy_tag = soup.find("span", class_="buyPrice")
    sell_tag = soup.find("span", class_="sellPrice")
    if buy_tag is None or sell_tag is None:
        # The prices are injected by JavaScript, so the static HTML served to
        # urllib may simply not contain these spans.
        print("Price spans not found in static HTML")
    else:
        # bug fix: prices like "1.08411" are decimals — int() raised ValueError
        buy = float(buy_tag.text)
        sell = float(sell_tag.text)
        print("Buy", buy)
        print("Sell", sell)
    time.sleep(1)  # avoid hammering the server in a tight loop
The data is loaded via Javascript, but you can simulate the Ajax request with requests library:
import requests

# Query the Plus500 feed endpoint directly instead of scraping the
# JavaScript-rendered page; it returns the quote as JSON.
feed_url = 'https://marketools.plus500.com/Feeds/UpdateTable?instsIds=2&isUseSentiments=true'
request_headers = {'User-Agent':'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:75.0) Gecko/20100101 Firefox/75.0'}
payload = requests.get(feed_url, headers=request_headers).json()
# print(payload) # <-- uncomment this to print all data
quote = payload['Feeds'][0]
print('Buy =', quote['B'])
print('Sell =', quote['S'])
Prints:
Buy = 1.08411
Sell = 1.08403

User-agent error with web scraping python3

It is my first time using web scraping. When I am using page = requests.get(URL) it works perfectly fine but when I am adding
headers = {"User-Agent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.4 Safari/605.1.15'}
page = requests.get(URL, headers=headers)
I am getting an error
title = soup.find(id="productTitle").get_text()
AttributeError: 'NoneType' object has no attribute 'get_text'
What's wrong with that? Should I resign with headers?
I think the page contains invalid HTML and therefore BeautifulSoup is not able to find your element.
Try to prettify the HTML first:
# Fetch an Amazon product page and print its title.
import requests
from bs4 import BeautifulSoup
URL = 'https://www.amazon.com/dp/B07JP9QJ15/ref=dp_cerb_1'
headers = {
    "User-Agent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.4 Safari/605.1.15'}
page = requests.get(URL, headers=headers)
# bug fix: prettify() + re-parse was redundant — parsing once yields the same
# tree; the real failure mode is the element being absent.
soup = BeautifulSoup(page.text, 'html.parser')
title = soup.find(id='productTitle')
if title is None:
    # Amazon serves a CAPTCHA/robot-check page when it flags the request, and
    # that page has no productTitle — this was the AttributeError's cause.
    print("productTitle not found - likely a bot-check page")
else:
    print(title.get_text(strip=True))  # strip surrounding whitespace
Which returns:
Dell UltraSharp U2719D - LED Monitor - 27"

How do I only receive new links from a page monitor created in Python?

I've written a page monitor to receive the latest product link from Nike.com, but I only want it to return a link to me if it's from a product that has just been uploaded to the site. I haven't been able to find any help similar to this. This is the page monitor written in Python. Any help with Python returning only new links would be helpful.
import requests
from bs4 import BeautifulSoup
import time
import json
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
}


def item_finder():
    """Fetch the Nike launch page and print the first product card's URL.

    Returns the URL string, or None when no product card is found.
    """
    source = requests.get('https://www.nike.com/launch/', headers=headers).text
    soup = BeautifulSoup(source, 'lxml')
    card = soup.find('figure', class_='item ncss-col-sm-12 ncss-col-md-6 ncss-col-lg-4 va-sm-t pb2-sm pb4-md prl0-sm prl2-md ')
    # bug fix: guard against the card (or its anchor) being absent so a layout
    # change does not crash with AttributeError on None.
    if card is None or card.a is None:
        print("No product card found")
        return None
    card_data = "https://nike.com" + card.a.get('href')
    print(card_data)
    return card_data


item_finder()

Content missing in html file scraping with BeautifulSoup

I am trying to scrape the content of this site: http://www.whoscored.com/Matches/824609/Live . If I view the HTML file in Chrome under the Network tab, I am able to see everything I want to scrape. But if I run the script below, a lot of data is missing from the results — data in JSON format, covering every specific event during the game. Why is the result different when I inspect the content in Chrome versus when I scrape the site?
# Fetch the match page and dump the parsed HTML. Note: the per-event match
# data the question asks about is loaded by JavaScript after page load, so it
# will not appear in this static response.
import requests
from bs4 import BeautifulSoup
url = 'http://www.whoscored.com/Matches/824609/Live'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36'}
response = requests.get(url, headers=headers)
# bug fix: the original used a Python 2 print statement (`print soup`); also
# pass the parser explicitly so bs4 does not warn and results are reproducible.
soup = BeautifulSoup(response.text, 'html.parser')
print(soup)

Categories