I am trying to scrape data (I just need the URL of each Lego set's page) from https://www.brickeconomy.com/sets/theme/collectable-minifigures, but the site paginates using the JavaScript __doPostBack function. From other relevant answers I learned that I need to inspect the POST request to identify the form data it sends, as seen here:
Screenshot of Request's Form Data
My code is as follows now:
import requests
from bs4 import BeautifulSoup
url = "http://www.brickeconomy.com/sets/theme/collectable-minifigures"
page_content = requests.get(url).content
soup = BeautifulSoup(page_content, 'html.parser')
VIEWSTATEGENERATOR = soup.find('input',{'id':'__VIEWSTATEGENERATOR'}).get('value')
VIEWSTATE = soup.find('input',{'id':'__VIEWSTATE'}).get('value')
headers = {'user-agent': 'Mozilla/5.0'}
data = {
    "ctl00$ScriptManager1": "ctl00$ContentPlaceHolder1$ctlSets$UpdatePanelMain|ctl00$ContentPlaceHolder1$ctlSets$GridViewSets",
    "ctl00$txtSearchHeader2": "",
    "ctl00$txtSearchHeader": "",
    "subthemesorter": "",
    "setsorter": "SetNumberDESC",
    "ctl00$LoginModalUsername": "",
    "ctl00$LoginModalPassword": "",
    "__EVENTTARGET": "ctl00$ContentPlaceHolder1$ctlSets$GridViewSets",
    "__EVENTARGUMENT": "Page$2",
    "__VIEWSTATE": VIEWSTATE,
    "__VIEWSTATEGENERATOR": VIEWSTATEGENERATOR,
    "__ASYNCPOST": 'true'
}
res = requests.post(url, data=data, headers=headers).content
BeautifulSoup(res, 'html.parser').find_all(class_='mb-5')
However, it is still showing the data from the first page. Would appreciate any advice here. Thank you!
You were sending POST requests to the wrong URL. Once I replaced your existing URL with the correct one, the script started to work:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
base = 'https://www.brickeconomy.com'
start_url = 'http://www.brickeconomy.com/sets/theme/collectable-minifigures'
post_url = 'https://www.brickeconomy.com/sets/theme/sets/theme/collectable-minifigures'
data = {
    "ctl00$ScriptManager1": "ctl00$ContentPlaceHolder1$ctlSets$UpdatePanelMain|ctl00$ContentPlaceHolder1$ctlSets$GridViewSets",
    "ctl00$txtSearchHeader2": "",
    "ctl00$txtSearchHeader": "",
    "subthemesorter": "",
    "setsorter": "SetNumberDESC",
    "ctl00$LoginModalUsername": "",
    "ctl00$LoginModalPassword": "",
    "__EVENTTARGET": "ctl00$ContentPlaceHolder1$ctlSets$GridViewSets",
    "__EVENTARGUMENT": "Page$2",
    "__ASYNCPOST": 'true'
}
with requests.Session() as s:
    s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'
    r = s.get(start_url)
    soup = BeautifulSoup(r.text, "lxml")
    data['__VIEWSTATE'] = soup.find('input', {'id': '__VIEWSTATE'}).get('value')
    data['__VIEWSTATEGENERATOR'] = soup.find('input', {'id': '__VIEWSTATEGENERATOR'}).get('value')
    res = s.post(post_url, data=data)
    soup = BeautifulSoup(res.text, "lxml")
    for item in soup.select("table.table > tr h4 > a"):
        inner_url = urljoin(base, item.get("href"))
        print(inner_url)
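If you need more than the second page, you can loop over page numbers by changing __EVENTARGUMENT. The sketch below reuses base, start_url, post_url, and data from the snippet above; it assumes the __VIEWSTATE taken from the initial GET stays valid for later page postbacks, which may not hold — if later pages come back empty, re-extract the hidden fields from each response before the next POST:
# Sketch (assumption: the initial __VIEWSTATE is accepted for every page)
with requests.Session() as s:
    s.headers['User-Agent'] = 'Mozilla/5.0'
    r = s.get(start_url)
    soup = BeautifulSoup(r.text, "lxml")
    data['__VIEWSTATE'] = soup.find('input', {'id': '__VIEWSTATE'}).get('value')
    data['__VIEWSTATEGENERATOR'] = soup.find('input', {'id': '__VIEWSTATEGENERATOR'}).get('value')
    for page in range(2, 6):  # pages 2..5, adjust to the real page count
        data['__EVENTARGUMENT'] = f'Page${page}'
        res = s.post(post_url, data=data)
        page_soup = BeautifulSoup(res.text, "lxml")
        for item in page_soup.select("table.table > tr h4 > a"):
            print(urljoin(base, item.get("href")))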
I am trying to pull the data that follows 'series: ', as shown below.
}
},
series: [{ name: '', showInLegend: false, animation: false, color: '#c84329', lineWidth: 2, data: [[1640926800000,164243],[1638248400000,224192],[1635566400000,143606],[1632974400000,208461],[1630382400000,85036],[1627704000000,25604],[1625025600000,44012],[1622433600000,111099],[1619755200000,53928],[1617163200000,12286],[1614488400000,12622],[1612069200000,4519],[1609390800000,12665],[1606712400000,314],[1604116800000,3032],[1601438400000,4164],[1598846400000,3302],[1596168000000,22133],[1593489600000,8098],[1590897600000,-1385],[1588219200000,43165],[1585627200000,427],[1582952400000,175],[1580446800000,174],[1577768400000,116],[1575090000000,196],[1572494400000,215],[1569816000000,418],[1567224000000,375],[1564545600000,375],[1561867200000,179],[1559275200000,132],[1556596800000,146],[1554004800000,163],[1551330000000,3],[1548910800000,49],[1546232400000,-29],[1543381200000,108],[1540958400000,35],[1538280000000,159],[1535688000000,287],[1533009600000,1152],[1530331200000,1306]] }],
navigation: { menuItemStyle: { fontSize: '9px' } }
});
More specifically, I'm trying to pull data, which is a list of Unix timestamps and ints. This is what I have so far...
url = "https://socialblade.com/twitter/user/twitter"
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:80.0) Gecko/20100101 Firefox/80.0'}
req = urllib.request.Request(url, headers=headers)
response = urllib.request.urlopen(req)
soup = bs(response.read(), 'html.parser')
soup = soup.find_all('script', {"type": "text/javascript"})
script = soup[6].text
Any thoughts?
The script is a string, so we can use the re module to find all occurrences of "data" in it. Every data entry in the script ends with "}", so we can find the first "}" after each match and slice the string between the start of the "data" substring and that "}" to pull out the data. The code and output are given below.
import re

sub = "data"
res = re.finditer(sub, script)
for i in res:
    k = script.find("}", i.start())
    print(script[i.start():k])
Output is:
Complete script to get the required data:
import requests
from bs4 import BeautifulSoup
url = "https://socialblade.com/twitter/user/twitter"
s = requests.Session()
r = s.get(
    url,
    headers={
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36",
    },
)
soup = BeautifulSoup(r.text, "html.parser")
req = soup.find_all("script", {"type": "text/javascript"})
script = req[6].contents[0]
data = script[2447: 3873]
print(data)
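Rather than relying on the hardcoded slice indices (2447:3873), which break as soon as the page changes, you could pull the data array out with a regular expression and parse it with json. A minimal sketch, assuming the chart definition keeps the data: [[...]] shape shown in the question:
import json
import re

import requests
from bs4 import BeautifulSoup

url = "https://socialblade.com/twitter/user/twitter"
headers = {"User-Agent": "Mozilla/5.0"}
soup = BeautifulSoup(requests.get(url, headers=headers).text, "html.parser")

# Join the inline scripts and grab the first "data: [[...]]" array from the chart config
scripts = "\n".join(s.get_text() for s in soup.find_all("script", {"type": "text/javascript"}))
match = re.search(r"data:\s*(\[\[.*?\]\])", scripts, re.DOTALL)
if match:
    pairs = json.loads(match.group(1))  # list of [unix_timestamp_ms, value] pairs
    for timestamp_ms, value in pairs[:5]:
        print(timestamp_ms, value)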
I want to get the Verify href from the Gmailnator inbox; the page contains the Discord verify href.
I want to get this href using bs4 and then pass it to a Selenium driver, i.e. driver.get(url), with the url being that href.
Can someone show how to scrape the href from the Gmailnator inbox? I tried the page source, but it does not contain the href.
This is the code I have written to get the href, but the href I need (the Discord one) is inside a frame source, which I think is why it doesn't show up.
UPDATE! EVERYTHING IS DONE AND FIXED
import time

import requests
from bs4 import BeautifulSoup
from selenium import webdriver

driver = webdriver.Chrome()  # assumes a local chromedriver on PATH
driver.get('https://www.gmailnator.com/inbox/#for.ev.e.r.my.girlt.m.p#gmail.com')
time.sleep(6)
driver.find_element_by_xpath('//*[@id="mailList"]/tbody/tr[2]/td/a/table/tbody/tr/td[1]').click()
time.sleep(4)
url = driver.current_url
email_for_data = driver.current_url.split('/')[-3]
print(url)
time.sleep(2)
print('Getting Your Discord Verify link')
print('Time To Get Your Discord Link')
soup = BeautifulSoup(requests.get(url).text, "lxml")
token = soup.find("meta", {"name": "csrf-token"})["content"]
cf_email = soup.find("a", class_="__cf_email__")["data-cfemail"]
endpoint = "https://www.gmailnator.com/mailbox/get_single_message/"
data = {
    "csrf_gmailnator_token": token,
    "action": "get_message",
    "message_id": url.split("#")[-1],
    "email": f"{email_for_data}",
}
headers = {
    "referer": f"https://www.gmailnator.com/{email_for_data}/messageid/",
    "cookie": f"csrf_gmailnator_cookie={token}; ci_session={cf_email}",
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.86 "
                  "YaBrowser/21.3.0.740 Yowser/2.5 Safari/537.36",
    "x-requested-with": "XMLHttpRequest",
}
r = requests.post(endpoint, data=data, headers=headers)
the_real_slim_shady = (
    BeautifulSoup(r.json()["content"], "lxml")
    .find_all("a", {"target": "_blank"})[1]["href"]
)
print(the_real_slim_shady)
You can fake it all with pure requests to get the Verify link. First, you need to get the token and the cf_email values. Then, things are pretty straightforward.
Here's how to get the link:
import requests
from bs4 import BeautifulSoup
url = "https://www.gmailnator.com/geralddoreyestmp/messageid/#179b454b4c482c4d"
soup = BeautifulSoup(requests.get(url).text, "lxml")
token = soup.find("meta", {"name": "csrf-token"})["content"]
cf_email = soup.find("a", class_="__cf_email__")["data-cfemail"]
endpoint = "https://www.gmailnator.com/mailbox/get_single_message/"
data = {
    "csrf_gmailnator_token": token,
    "action": "get_message",
    "message_id": url.split("#")[-1],
    "email": "geralddoreyestmp",
}
headers = {
    "referer": "https://www.gmailnator.com/geralddoreyestmp/messageid/",
    "cookie": f"csrf_gmailnator_cookie={token}; ci_session={cf_email}",
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.86 "
                  "YaBrowser/21.3.0.740 Yowser/2.5 Safari/537.36",
    "x-requested-with": "XMLHttpRequest",
}
r = requests.post(endpoint, data=data, headers=headers)
the_real_slim_shady = (
    BeautifulSoup(r.json()["content"], "lxml")
    .find_all("a", {"target": "_blank"})[1]["href"]
)
print(the_real_slim_shady)
Output (your link will be different!):
https://click.discord.com/ls/click?upn=qDOo8cnwIoKzt0aLL1cBeARJoBrGSa2vu41A5vK-2B4us-3D77CR_3Tswyie9C2vHlXKXm6tJrQwhGg-2FvQ76GD2o0Zl2plCYHULNsKdCuB6s-2BHk1oNirSuR8goxCccVgwsQHdq1YYeGQki4wtPdDA3zi661IJL7H0cOYMH0IJ0t3sgrvr2oMX-2BJBA-2BWZzY42AwgjdQ-2BMAN9Y5ctocPNK-2FUQLxf6HQusMayIeATMiTO-2BlpDytu-2FnIW4axB32RYQpxPGO-2BeHtcSj7a7QeZmqK-2B-2FYkKA4dl5q8I-3D
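Since the original goal was to pass the link to Selenium, you can hand the scraped href straight to driver.get. A minimal sketch, assuming a local Chrome driver and the_real_slim_shady from the snippet above:
from selenium import webdriver

driver = webdriver.Chrome()       # assumes chromedriver is on PATH
driver.get(the_real_slim_shady)   # open the scraped Discord verify link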
I am trying to get the date shown below (01/19/2021), and I would like to get the "19" into a Python variable.
<span class="grayItalic">
Received: 01/19/2021
</span>
Here is the piece of code that isn't working:
date = soup.find('span', {'class': 'grayItalic'}).get_text()
converted_date = int(date[13:14])
print(date)
I get this error: 'NoneType' object has no attribute 'get_text'.
Could anyone help?
Try this with headers:
import requests
from bs4 import BeautifulSoup
url = "https://iapps.courts.state.ny.us/nyscef/DocumentList?docketId=npvulMdOYzFDYIAomW_PLUS_elw==&display=all"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
}
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content,'html.parser')
date = soup.find('span', {'class': 'grayItalic'}).get_text().strip()
converted_date = int(date.split("/")[-2])
print(converted_date)
print(date)
import dateutil.parser
from bs4 import BeautifulSoup
html_doc = """<span class="grayItalic">
Received: 01/19/2021
</span>"""
soup=BeautifulSoup(html_doc,'html.parser')
date_ = soup.find('span', {'class': 'grayItalic'}).get_text()
dateutil.parser.parse(date_,fuzzy=True)
Output:
datetime.datetime(2021, 1, 19, 0, 0)
date_ outputs '\n Received: 01/19/2021\n'. You've used string slicing; instead, you can use dateutil.parser, which will return a datetime.datetime object for you.
In this case I've assumed you just need the date. If you need the text too, you can use fuzzy_with_tokens=True.
if the fuzzy_with_tokens option is True, returns a tuple, the first element being a datetime.datetime object, the second a tuple containing the fuzzy tokens.
dateutil.parser.parse(date_,fuzzy_with_tokens=True)
(datetime.datetime(2021, 1, 19, 0, 0), (' Received: ', ' '))
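If you specifically want the 19 as an integer, take .day from the parsed datetime:
day = dateutil.parser.parse(date_, fuzzy=True).day
print(day)  # 19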
I couldn't load the URL using the requests or urllib module. I guess the website is blocking automated requests, so I opened the webpage, saved the source code to a file named page.html, and ran the BeautifulSoup operations on it. That worked.
from bs4 import BeautifulSoup

html = open("page.html")
soup = BeautifulSoup(html, 'html.parser')
date_span = soup.find('span', {'class': 'grayItalic'})
if date_span is not None:
    print(str(date_span.text).strip().replace("Received: ", ""))
    # output: 04/25/2019
I tried scraping the source code with the requests library as follows, but it didn't work (the webpage is probably blocking the request). See if it works on your machine.
import requests

url = "..."
headers = {
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Methods': 'GET',
    'Access-Control-Allow-Headers': 'Content-Type',
    'Access-Control-Max-Age': '3600',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
}
response = requests.get(url, headers=headers)
html = response.content
print(html)
I'm trying to build a simple script to scrape Google's first Search Results Page and export the results in .csv.
I managed to get URLs and Titles, but I cannot retrieve Descriptions.
I have been using the following code:
import urllib
import requests
from bs4 import BeautifulSoup
# desktop user-agent
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0"
# mobile user-agent
MOBILE_USER_AGENT = "Mozilla/5.0 (Linux; Android 7.0; SM-G930V Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.125 Mobile Safari/537.36"
query = "pizza recipe"
query = query.replace(' ', '+')
URL = f"https://google.com/search?q={query}"
headers = {"user-agent": USER_AGENT}
resp = requests.get(URL, headers=headers)
if resp.status_code == 200:
    soup = BeautifulSoup(resp.content, "html.parser")
    results = []
    for g in soup.find_all('div', class_='r'):
        anchors = g.find_all('a')
        if anchors:
            link = anchors[0]['href']
            title = g.find('h3').text
            desc = g.select('span')
            description = g.find('span', {'class': 'st'}).text
            item = {
                "title": title,
                "link": link,
                "description": description
            }
            results.append(item)

import pandas as pd
df = pd.DataFrame(results)
df.to_excel("Export.xlsx")
I get the following message when I run the code:
description = g.find('span',{'class':'st'}).text
AttributeError: 'NoneType' object has no attribute 'text'
Essentially, the field is empty.
Can somebody please help me with this line so that I can get all the information from the snippet?
It's not within the div with class="r"; it's under the div with class="s".
So change to this for description:
description = g.find_next_sibling("div", class_='s').find('span',{'class':'st'}).text
From the current element, it will find the next sibling div with class="s"; from that you can pull out the <span> tag.
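In context, the loop from the question would look roughly like this (a sketch that keeps the question's selectors r, s, and st; Google changes its markup often, so they may need updating):
results = []
for g in soup.find_all('div', class_='r'):
    anchors = g.find_all('a')
    if anchors:
        link = anchors[0]['href']
        title = g.find('h3').text
        # the snippet lives in the sibling div with class="s", not inside class="r"
        sibling = g.find_next_sibling('div', class_='s')
        description = sibling.find('span', {'class': 'st'}).text if sibling else ''
        results.append({"title": title, "link": link, "description": description})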
Try to use the bs4 select_one() or select() methods instead. They're more flexible and easier to read (see a CSS selectors reference).
Also, you can pass URL params as a dict and let requests build the query string for you, like so:
# instead of this:
query = "pizza recipe"
query = query.replace(' ', '+')
URL = f"https://google.com/search?q={query}"
# try to use this:
params = {
'q': 'fus ro dah', # query
'hl': 'en'
}
requests.get('URL', params=params)
If you want to write to .csv, then you need to use .to_csv() rather than .to_excel().
If you want to get rid of the pandas index column, you can pass index=False, e.g. df.to_csv('FILE_NAME', index=False).
Code and example in the online IDE:
import pandas as pd
import requests
from bs4 import BeautifulSoup
headers = {
    "User-agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}

params = {
    'q': 'fus ro dah', # query
    'hl': 'en'
}

resp = requests.get("https://google.com/search", headers=headers, params=params)

if resp.status_code == 200:
    soup = BeautifulSoup(resp.text, "html.parser")
    results = []
    for result in soup.select('.tF2Cxc'):
        title = result.select_one('.DKV0Md').text
        link = result.select_one('.yuRUbf a')['href']
        snippet = result.select_one('#rso .lyLwlc').text
        item = {
            "title": title,
            "link": link,
            "description": snippet
        }
        results.append(item)

    df = pd.DataFrame(results)
    df.to_csv("BS4_Export.csv", index=False)
Alternatively, you can do the same thing by using Google Organic Results API from SerpApi. It's a paid API with a free plan.
The difference in your case is that you don't have to figure out which selectors to use or why they don't work (even though they should), since that's already done for the end user.
Code to integrate:
from serpapi import GoogleSearch
import os
import pandas as pd
params = {
    "api_key": os.getenv("API_KEY"),
    "engine": "google",
    "q": "fus ro dah",
    "hl": "en"
}

search = GoogleSearch(params)
results = search.get_dict()

data = []
for result in results['organic_results']:
    title = result['title']
    link = result['link']
    snippet = result['snippet']
    data.append({
        "title": title,
        "link": link,
        "snippet": snippet
    })

df = pd.DataFrame(data)  # build the frame from the collected rows, not the raw API response
df.to_csv("SerpApi_Export.csv", index=False)
P.S. I wrote a more detailed blog post about how to scrape Google Organic Results.
Disclaimer: I work for SerpApi.
I have an issue using Python and BeautifulSoup to extract URLs from the Bing search engine. I want to extract the content within <div class="b_title"> tags, but when I run this code, the urls variable is empty:
import requests, re
from bs4 import BeautifulSoup
payload = { 'q' : 'sport', 'first' : '11' }
headers = { 'User-agent' : 'Mozilla/11.0' }
req = requests.get( 'https://www.bing.com/search', payload, headers=headers )
soup = BeautifulSoup( req.text, 'html.parser' )
urls = soup.find_all('div', class_="b_title")
print(urls)
You need to go two elements up and select the li element by its class (that worked for me), or you can use SelectorGadget to grab CSS selectors to use with the select() or select_one() methods.
Code and full example:
from bs4 import BeautifulSoup
import requests
import lxml
headers = {
    "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}

response = requests.get(
    "https://www.bing.com/search?form=QBRE&q=lasagna",
    headers=headers).text

soup = BeautifulSoup(response, 'lxml')

for container in soup.select('.b_algo h2 a'):
    links = container['href']
    print(links)
Output:
https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/
https://www.tasteofhome.com/recipes/best-lasagna/
https://www.foodnetwork.com/topics/lasagna
https://www.allrecipes.com/recipes/502/main-dish/pasta/lasagna/
https://www.simplyrecipes.com/recipes/lasagna/
https://www.delish.com/cooking/recipe-ideas/recipes/a51337/classic-lasagna-recipe/
https://www.marthastewart.com/343399/lasagna
https://www.thepioneerwoman.com/food-cooking/recipes/a11728/best-lasagna-recipe/
https://therecipecritic.com/lasagna-recipe/
Alternatively, you can use Bing Search Engine Results API from SerpApi. It's a paid API with a free plan.
Part of the JSON output:
"organic_results": [
{
"position": 1,
"title": "World's Best Lasagna | Allrecipes",
"link": "https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/",
"displayed_link": "https://www.allrecipes.com/recipe/23600",
"sitelinks": {
"inline": [
{
"title": "Play Video",
"link": "https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/"
}
]
}
}
]
Code to integrate:
import os
from serpapi import GoogleSearch
params = {
    "q": "lasagna",
    "engine": "bing",
    "api_key": os.getenv("API_KEY"),
}

search = GoogleSearch(params)
results = search.get_dict()

for link in results["organic_results"]:
    print(f"Link: {link['link']}")
Output:
https://www.allrecipes.com/recipe/23600/worlds-best-lasagna/
https://www.tasteofhome.com/recipes/best-lasagna/
https://www.foodnetwork.com/topics/lasagna
https://www.simplyrecipes.com/recipes/lasagna/
https://www.delish.com/cooking/recipe-ideas/recipes/a51337/classic-lasagna-recipe/
https://www.marthastewart.com/343399/lasagna
https://www.thepioneerwoman.com/food-cooking/recipes/a11728/best-lasagna-recipe/
Disclaimer: I work for SerpApi.