Trying to web-scrape the date of the last document on a webpage in Python

I am trying to get the date shown below (01/19/2021), and I would like to get the "19" into a Python variable:
<span class="grayItalic">
Received: 01/19/2021
</span>
Here is the piece of code that isn't working:
date = soup.find('span', {'class': 'grayItalic'}).get_text()
converted_date = int(date[13:14])
print(date)
I get this error: 'NoneType' object has no attribute 'get_text'
Can anyone help?

Try this with headers:
import requests
from bs4 import BeautifulSoup
url = "https://iapps.courts.state.ny.us/nyscef/DocumentList?docketId=npvulMdOYzFDYIAomW_PLUS_elw==&display=all"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
}
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content,'html.parser')
date = soup.find('span', {'class': 'grayItalic'}).get_text().strip()
converted_date = int(date.split("/")[-2])
print(converted_date)
print(date)
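If the span still can't be found, even with headers, soup.find() returns None, which is exactly what produced the original error. A small defensive sketch, assuming the same page structure:
span = soup.find('span', {'class': 'grayItalic'})
if span is None:
    print("span not found; the site may be blocking the request")
else:
    date = span.get_text().strip()  # e.g. "Received: 01/19/2021"
    print(int(date.split("/")[-2]))  # 19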

import dateutil.parser
from bs4 import BeautifulSoup
html_doc=""""<span class="grayItalic">
Received: 01/19/2021
</span>"""
soup=BeautifulSoup(html_doc,'html.parser')
date_ = soup.find('span', {'class': 'grayItalic'}).get_text()
dateutil.parser.parse(date_,fuzzy=True)
Output:
datetime.datetime(2021, 1, 19, 0, 0)
date_ evaluates to '\n Received: 01/19/2021\n'. Instead of string slicing, you can use dateutil.parser, which will return a datetime.datetime object for you.
In this case I've assumed you just need the date. If you need the surrounding text too, you can use fuzzy_with_tokens=True.
if the fuzzy_with_tokens option is True, returns a tuple, the first element being a datetime.datetime object, the second a tuple containing the fuzzy tokens.
dateutil.parser.parse(date_,fuzzy_with_tokens=True)
(datetime.datetime(2021, 1, 19, 0, 0), (' Received: ', ' '))
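If you specifically need the 19 as an integer, the parsed datetime exposes it directly; a one-line follow-up to the code above:
parsed = dateutil.parser.parse(date_, fuzzy=True)
day = parsed.day  # 19, already an int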

I couldn't load the URL using the requests or urllib module; I guess the website is blocking automated requests. So I opened the webpage in a browser, saved the source code to a file named page.html, and ran the BeautifulSoup operations on that. That seems to have worked.
from bs4 import BeautifulSoup

html = open("page.html")
soup = BeautifulSoup(html, 'html.parser')
date_span = soup.find('span', {'class': 'grayItalic'})
if date_span is not None:
    print(str(date_span.text).strip().replace("Received: ", ""))
    # output: 04/25/2019
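If you need the day as an integer, as in the original question, you can split the cleaned string; a minimal sketch, assuming the same "Received: MM/DD/YYYY" format:
date_text = date_span.text.strip().replace("Received: ", "")  # e.g. "04/25/2019"
month, day, year = date_text.split("/")
converted_date = int(day)  # 25 as an int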
I tried scraping the page with the requests library as follows, but it didn't work (the webpage is probably blocking the request). See if it works on your machine:
url = "..."
headers = {
'Access-Control-Allow-Origin': '*',
'Access-Control-Allow-Methods': 'GET',
'Access-Control-Allow-Headers': 'Content-Type',
'Access-Control-Max-Age': '3600',
'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
}
response = requests.get(url, headers=headers)
html = response.content
print(html)
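Note that the Access-Control-* entries are CORS response headers, so they have no effect when sent with a request; the User-Agent is what matters here. A trimmed sketch with just that header and a status check:
import requests

headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'}
response = requests.get(url, headers=headers, timeout=30)
print(response.status_code)  # 403 or 503 usually means the site is blocking scrapers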

Related

How To Extract Data Within a Javascript Tag Using Python's BeautifulSoup

I am trying to pull the data that follows 'series: ', as shown below.
}
},
series: [{ name: '', showInLegend: false, animation: false, color: '#c84329', lineWidth: 2, data: [[1640926800000,164243],[1638248400000,224192],[1635566400000,143606],[1632974400000,208461],[1630382400000,85036],[1627704000000,25604],[1625025600000,44012],[1622433600000,111099],[1619755200000,53928],[1617163200000,12286],[1614488400000,12622],[1612069200000,4519],[1609390800000,12665],[1606712400000,314],[1604116800000,3032],[1601438400000,4164],[1598846400000,3302],[1596168000000,22133],[1593489600000,8098],[1590897600000,-1385],[1588219200000,43165],[1585627200000,427],[1582952400000,175],[1580446800000,174],[1577768400000,116],[1575090000000,196],[1572494400000,215],[1569816000000,418],[1567224000000,375],[1564545600000,375],[1561867200000,179],[1559275200000,132],[1556596800000,146],[1554004800000,163],[1551330000000,3],[1548910800000,49],[1546232400000,-29],[1543381200000,108],[1540958400000,35],[1538280000000,159],[1535688000000,287],[1533009600000,1152],[1530331200000,1306]] }],
navigation: { menuItemStyle: { fontSize: '9px' } }
});
More specifically, I'm trying to pull out data, which is a list of Unix timestamps and ints. This is what I have so far...
url = "https://socialblade.com/twitter/user/twitter"
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:80.0) Gecko/20100101 Firefox/80.0'}
req = urllib.request.Request(url, headers=headers)
response = urllib.request.urlopen(req)
soup = bs(response.read(), 'html.parser')
soup = soup.find_all('script', {"type": "text/javascript"})
script = soup[6].text
Any thoughts?
The script is a string, so we can use the re module to find every occurrence of "data" in it. Observing that each data entry in the script ends with "}", we can locate the first "}" after each occurrence; with the index of the start of the "data" substring and the index of that "}", string slicing extracts the data. The code is given below.
import re

sub = "data"
res = re.finditer(sub, script)
for i in res:
    k = script.find("}", i.start())
    print(script[i.start():k])
Each iteration prints one sliced data chunk. Here is the complete script for your required data:
import requests
from bs4 import BeautifulSoup

url = "https://socialblade.com/twitter/user/twitter"
r = requests.get(
    url,
    headers={
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36",
    },
)
soup = BeautifulSoup(r.text, "html.parser")
req = soup.find_all("script", {"type": "text/javascript"})
script = req[6].contents[0]
data = script[2447:3873]  # hardcoded slice covering the series data
print(data)
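The hardcoded script[2447:3873] slice breaks as soon as the page changes length. A more robust sketch (assuming the embedded array keeps the data: [[...]] form) captures the bracketed list with one regex and parses it with ast.literal_eval:
import ast
import re

# capture everything between "data: [[" and the closing "]]"
m = re.search(r"data:\s*(\[\[.*?\]\])", script, re.DOTALL)
if m:
    points = ast.literal_eval(m.group(1))  # list of [timestamp_ms, value] pairs
    for ts_ms, value in points[:3]:
        print(ts_ms, value)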

How can I get an href from a row

I'm writing a Telegram bot, and I need to get links from HTML.
I want to get the href for each match from this website: https://www.hltv.org/matches
My current code is:
elif message.text == "Matches":
    url_news = "https://www.hltv.org/matches"
    response = requests.get(url_news)
    soup = BeautifulSoup(response.content, "html.parser")
    match_info = []
    match_items = soup.find("div", class_="upcomingMatchesSection")
    print(match_items)
    for item in match_items:
        match_info.append({
            "link": item.find("div", class_="upcomingMatch").text,
            "title": item["href"]
        })
And I don't know how I can get the links from this body. I'd appreciate any help.
What happens?
You try to iterate over match_items, but there is nothing to iterate over, because you only selected the section containing the matches, not the matches themselves.
How to fix?
Select the upcomingMatches instead and iterate over them:
match_items = soup.select("div.upcomingMatchesSection div.upcomingMatch")
To get the URL, you have to select an <a>:
item.a["href"]
Example
from bs4 import BeautifulSoup as bs
import requests
url_news = "https://www.hltv.org/matches"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
response = requests.get(url_news, headers=headers)
soup = BeautifulSoup(response.content, "html.parser")
match_info = []
match_items = soup.select("div.upcomingMatchesSection div.upcomingMatch")
for item in match_items:
    match_info.append({
        "title": item.get_text('|', strip=True),
        "link": item.a["href"]
    })
match_info
Output
[{'title': '09:00|bo3|1WIN|K23|Pinnacle Fall Series 2|Odds',
'link': '/matches/2352066/1win-vs-k23-pinnacle-fall-series-2'},
{'title': '09:00|bo3|INDE IRAE|Nemiga|Pinnacle Fall Series 2|Odds',
'link': '/matches/2352067/inde-irae-vs-nemiga-pinnacle-fall-series-2'},
{'title': '10:00|bo3|OPAA|Nexus|Malta Vibes Knockout Series 3|Odds',
'link': '/matches/2352207/opaa-vs-nexus-malta-vibes-knockout-series-3'},
{'title': '11:00|bo3|Checkmate|TBC|Funspark ULTI 2021 Asia Regional Series 3|Odds',
'link': '/matches/2352092/checkmate-vs-tbc-funspark-ulti-2021-asia-regional-series-3'},
{'title': '11:00|bo3|ORDER|Alke|ESEA Premier Season 38 Australia|Odds',
'link': '/matches/2352122/order-vs-alke-esea-premier-season-38-australia'},...]
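Note that the hrefs are site-relative. If the bot needs clickable URLs, you can join them against the base URL; a small sketch using urllib.parse.urljoin:
from urllib.parse import urljoin

base = "https://www.hltv.org"
absolute_links = [urljoin(base, m["link"]) for m in match_info]
# e.g. 'https://www.hltv.org/matches/2352066/1win-vs-k23-pinnacle-fall-series-2'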
You can try this out.
All the match information is present inside a <div> with the class name upcomingMatch.
Select all those <div> elements and, from each one, extract the match link, which is inside the <a> tag with the class name match.
Here is the code:
import requests
from bs4 import BeautifulSoup
url_news = "https://www.hltv.org/matches"
headers = {"User-agent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36"}
response = requests.get(url_news,headers=headers)
soup = BeautifulSoup(response.text, "lxml")
match_items = soup.find_all("div", class_="upcomingMatch")
for match in match_items:
    link = match.find('a', class_='match a-reset')['href']
    print(f'Link: {link}')
Link: /matches/2352235/malta-vibes-knockout-series-3-quarter-final-1-malta-vibes-knockout-series-3
Link: /matches/2352098/pinnacle-fall-series-2-quarter-final-2-pinnacle-fall-series-2
Link: /matches/2352236/malta-vibes-knockout-series-3-quarter-final-2-malta-vibes-knockout-series-3
Link: /matches/2352099/pinnacle-fall-series-2-quarter-final-3-pinnacle-fall-series-2
...

How to scrape data from Amazon Canada?

I am trying to scrape data from Amazon Canada (amazon.ca). I am using the requests and bs4 packages to send and parse HTML data, but I am not able to extract the data from the response. Can someone please help me extract information from the response?
import requests
from bs4 import BeautifulSoup
# Define headers
headers = {
    'content-type': 'text/html;charset=UTF-8',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'
}
# Amazon Canada product url
url = 'https://www.amazon.ca/INIU-High-Speed-Flashlight-Powerbank-Compatible/dp/B07CZDXDG8?ref_=Oct_s9_apbd_otopr_hd_bw_b3giFrP&pf_rd_r=69GE1K9DG49351YHSYBC&pf_rd_p=694b8fdf-0d96-57ba-b834-dc9bdeb7a094&pf_rd_s=merchandised-search-11&pf_rd_t=BROWSE&pf_rd_i=3379552011&th=1'
resp = requests.get(url, headers=headers)
print(resp)
<Response [200]>
Earlier it was showing <Response [503]>; after I added headers, it shows <Response [200]>. So I am trying to extract some information from the page.
# Using html parser
soup = BeautifulSoup(resp.content,'lxml')
# Extracting information from page
product_title = soup.find('span',id='productTitle')
print('product_title -' ,product_title)
product_price = soup.find('span',id='priceblock_ourprice')
print('product_price -' ,product_price)
product_title - None
product_price - None
But it is showing None, so I checked exactly what data is present in the soup by printing it:
soup.text
'\n\n\n\nRobot Check\n\n\n\n\nif (true === true) {\n var ue_t0 = (+
new Date()),\n ue_csm = window,\n ue = { t0: ue_t0, d:
function() { return (+new Date() - ue_t0); } },\n ue_furl =
"fls-na.amazon.ca",\n ue_mid = "A2EUQ1WTGCTBG2",\n
ue_sid = (document.cookie.match(/session-id=([0-9-]+)/) || [])[1],\n
ue_sn = "opfcaptcha.amazon.ca",\n ue_id =
\'0B2HQATTKET8J6M36Y3G\';\n}\n\n\n\n\n\n\n\n\n\n\n\nEnter the
characters you see below\nSorry, we just need to make sure you\'re not
a robot. For best results, please make sure your browser is accepting
cookies.\n\n\n\n\n\n\n\n\n\n\nType the characters you see in this
image:\n\n\n\n\n\n\n\n\nTry different
image\n\n\n\n\n\n\n\n\n\n\n\nContinue
shopping\n\n\n\n\n\n\n\n\n\n\n\nConditions of Use &
Sale\n\n\n\n\nPrivacy Notice\n\n\n \xa9 1996-2015,
Amazon.com, Inc. or its affiliates\n \n if (true
=== true) {\n document.write(\'<img src="https://fls-na.amaz\'+\'on.ca/\'+\'1/oc-csi/1/OP/requestId=0B2HQATTKET8J6M36Y3G&js=1"
/>\');\n };\n \n\n\n\n\n\n\n if (true === true)
{\n var head = document.getElementsByTagName(\'head\')[0],\n
prefix =
"https://images-na.ssl-images-amazon.com/images/G/01/csminstrumentation/",\n
elem = document.createElement("script");\n elem.src = prefix +
"csm-captcha-instrumentation.min.js";\n
head.appendChild(elem);\n\n elem =
document.createElement("script");\n elem.src = prefix +
"rd-script-6d68177fa6061598e9509dc4b5bdd08d.js";\n
head.appendChild(elem);\n }\n \n\n'
I checked the output thoroughly, but I didn't find any product data in the response. I tried the same check on resp.content and found nothing either. I also validated the URL (it is valid) and even tested the above script with public proxies, but still no output.
Can someone please help me extract information from the URL, or suggest any other way to get it done?
Try this:
import requests
from bs4 import BeautifulSoup
headers = {
    'content-type': 'text/html;charset=UTF-8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'en-US,en;q=0.8',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
}
url = 'https://www.amazon.ca/INIU-High-Speed-Flashlight-Powerbank-Compatible/dp/B07CZDXDG8'
resp = requests.get(url, headers=headers)
soup = BeautifulSoup(resp.content, 'lxml')
# Extracting information from page
print('product_title -', soup.find('span', id='productTitle').text.strip())
print('product_price -', soup.find('span', id='priceblock_ourprice').text.strip())
The code yields:
product_title - INIU Power Bank, Ultra-Slim Dual 3A High-Speed Portable Charger, 10000mAh USB C Input & Flashlight External Phone Battery Pack for iPhone Xs X 8 Plus Samsung S10 Google LG iPad etc. [2020 Upgrade]
product_price - CDN$ 60.66
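Keep in mind that Amazon still serves the Robot Check captcha page intermittently, in which case the .text calls above raise the same AttributeError. A defensive sketch, assuming the captcha page keeps its "Robot Check" title:
soup = BeautifulSoup(resp.content, 'lxml')
if soup.title and 'Robot Check' in soup.title.text:
    print('Blocked by captcha; retry later or rotate headers/proxies')
else:
    title_tag = soup.find('span', id='productTitle')
    print(title_tag.text.strip() if title_tag else 'productTitle not found')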

SERP Scraping with Beautiful Soup

I'm trying to build a simple script to scrape Google's first search results page and export the results to .csv.
I managed to get URLs and titles, but I cannot retrieve descriptions.
I have been using the following code:
import urllib
import requests
from bs4 import BeautifulSoup
# desktop user-agent
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0"
# mobile user-agent
MOBILE_USER_AGENT = "Mozilla/5.0 (Linux; Android 7.0; SM-G930V Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.125 Mobile Safari/537.36"
query = "pizza recipe"
query = query.replace(' ', '+')
URL = f"https://google.com/search?q={query}"
headers = {"user-agent": USER_AGENT}
resp = requests.get(URL, headers=headers)
if resp.status_code == 200:
    soup = BeautifulSoup(resp.content, "html.parser")
    results = []
    for g in soup.find_all('div', class_='r'):
        anchors = g.find_all('a')
        if anchors:
            link = anchors[0]['href']
            title = g.find('h3').text
            desc = g.select('span')
            description = g.find('span', {'class': 'st'}).text
            item = {
                "title": title,
                "link": link,
                "description": description
            }
            results.append(item)
import pandas as pd
df = pd.DataFrame(results)
df.to_excel("Export.xlsx")
I get the following message when I run the code:
description = g.find('span',{'class':'st'}).text
AttributeError: 'NoneType' object has no attribute 'text'
Essentially, the field is empty.
Can somebody please help me with this line so that I can get all the information from the snippet?
It's not within the div class="r"; it's under div class="s".
So change the description line to this:
description = g.find_next_sibling("div", class_='s').find('span',{'class':'st'}).text
From the current element, it will find the next div with class="s"; then you can pull out the <span> tag.
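Putting that into the original loop looks like this (a sketch; Google's class names change frequently, so 'r' and 's' may no longer match):
for g in soup.find_all('div', class_='r'):
    anchors = g.find_all('a')
    if anchors:
        link = anchors[0]['href']
        title = g.find('h3').text
        sibling = g.find_next_sibling('div', class_='s')
        description = sibling.find('span', {'class': 'st'}).text if sibling else ''
        results.append({"title": title, "link": link, "description": description})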
Try using the select_one() or select() bs4 methods instead; they're more flexible and easier to read (see the CSS selectors reference).
Also, you can pass URL params, since requests does the encoding for you, like so:
# instead of this:
query = "pizza recipe"
query = query.replace(' ', '+')
URL = f"https://google.com/search?q={query}"

# try to use this:
params = {
    'q': 'fus ro dah',  # query
    'hl': 'en'
}
requests.get('URL', params=params)
If you want to write to .csv, you need to use .to_csv() rather than .to_excel().
If you want to get rid of the pandas index column, you can pass index=False, e.g. df.to_csv('FILE_NAME', index=False).
Code and example in the online IDE:
import pandas as pd
import requests
from bs4 import BeautifulSoup
headers = {
    "User-agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}
params = {
    'q': 'fus ro dah',  # query
    'hl': 'en'
}
resp = requests.get("https://google.com/search", headers=headers, params=params)
if resp.status_code == 200:
    soup = BeautifulSoup(resp.text, "html.parser")
    results = []
    for result in soup.select('.tF2Cxc'):
        title = result.select_one('.DKV0Md').text
        link = result.select_one('.yuRUbf a')['href']
        snippet = result.select_one('#rso .lyLwlc').text
        item = {
            "title": title,
            "link": link,
            "description": snippet
        }
        results.append(item)
df = pd.DataFrame(results)
df.to_csv("BS4_Export.csv", index=False)
Alternatively, you can do the same thing by using Google Organic Results API from SerpApi. It's a paid API with a free plan.
The difference in your case is that you don't need to figure out what selectors to use and why they don't work although they should since it's already done for the end-user.
Code to integrate:
from serpapi import GoogleSearch
import os
import pandas as pd
params = {
    "api_key": os.getenv("API_KEY"),
    "engine": "google",
    "q": "fus ro dah",
    "hl": "en"
}
search = GoogleSearch(params)
results = search.get_dict()
data = []
for result in results['organic_results']:
    title = result['title']
    link = result['link']
    snippet = result['snippet']
    data.append({
        "title": title,
        "link": link,
        "snippet": snippet
    })
df = pd.DataFrame(data)
df.to_csv("SerpApi_Export.csv", index=False)
P.S - I wrote a bit more detailed blog post about how to scrape Google Organic Results.
Disclaimer, I work for SerpApi.

Scraping a website with clickable content in Python

I would like to scrape the content of the following website:
http://financials.morningstar.com/ratios/r.html?t=AMD
There, under Key Ratios, I would like to click on the "Growth" button and then scrape the data in Python.
How can I do that?
You can solve it with requests + BeautifulSoup. There is an asynchronous GET request sent to the http://financials.morningstar.com/financials/getKeyStatPart.html endpoint, which you need to simulate. The Growth table is located inside the div with id="tab-growth":
from bs4 import BeautifulSoup
import requests

url = 'http://financials.morningstar.com/ratios/r.html?t=AMD'
keystat_url = 'http://financials.morningstar.com/financials/getKeyStatPart.html'

with requests.Session() as session:
    session.headers = {'User-Agent': 'Mozilla/5.0 (Linux; U; Android 4.0.3; ko-kr; LG-L160L Build/IML74K) AppleWebkit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30'}

    # visit the target url to set up session cookies
    session.get(url)

    params = {
        'callback': '',
        't': 'XNAS:AMD',
        'region': 'usa',
        'culture': 'en-US',
        'cur': '',
        'order': 'asc',
        '_': '1426047023943'
    }
    response = session.get(keystat_url, params=params)

    # get the HTML part from the JSON response
    soup = BeautifulSoup(response.json()['componentData'], 'html.parser')

    # grab the data
    for row in soup.select('div#tab-growth table tr'):
        print(row.text)
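The hardcoded '_' value is just a cache-busting timestamp; a small sketch that generates it at request time instead (an assumption based on how such parameters usually work):
import time

params['_'] = str(int(time.time() * 1000))  # current epoch milliseconds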
