Scrape tables with python for player lists - python

I am trying to scrape the EA Sports football tables for players from this website:
https://www.easports.com/fifa/ultimate-team/fut/database/results?position_secondary=LF,CF,RF,ST,LW,LM,CAM,CDM,CM,RM,RW,LWB,LB,CB,RB,RWB
I have run this simple code, however I am unable to get any output:
import requests, bs4
r = requests.get('https://www.easports.com/fifa/ultimate-team/fut/database/results?position_secondary=LF,CF,RF,ST,LW,LM,CAM,CDM,CM,RM,RW,LWB,LB,CB,RB,RWB')
soup = bs4.BeautifulSoup(r.text, 'lxml')
contents = soup.find(class_='contrast-white')
Can anybody help me with it please?

So the problem with that page is that those elements are dynamically generated by JavaScript.
Fortunately for us, most of the data comes through API calls, so we can use our browser cookies to bypass this limitation and make requests to the actual API.
This is what I came up with; I hope it fits your needs:
import json
import requests


def parse_item(item):
    attr_list = item['attributes']
    return {
        'name': item['name'],
        'type': item['playerType'],
        'OVR': item['composure'],
        'POS': item['position'],
        'PAC': get_attr_by_name(attr_list, 'PAC'),
        'DRI': get_attr_by_name(attr_list, 'DRI'),
        'SHO': get_attr_by_name(attr_list, 'SHO'),
        'DEF': get_attr_by_name(attr_list, 'DEF'),
        'PAS': get_attr_by_name(attr_list, 'PAS'),
        'PHY': get_attr_by_name(attr_list, 'PHY'),
    }


def get_attr_by_name(attr_list, attr_name):
    attr_name = attr_name.upper()
    try:
        return next(item['value'] for item in attr_list if item['name'].endswith(attr_name))
    except StopIteration:
        return None


# Cookie values copied from the browser's dev tools; replace them with your own session's values.
# Note that 'User-Agent' is normally a request header, but sending it as a cookie is harmless here.
cookies = {
    'hl': 'us',
    'ak_bmsc': '2F856B67859A41FAFB7A62172F068FA7C99F9D14F555000037F4435B86E7E136~plcKkcciaz+3qtfstmojfDw6NLaOVQ0MD41+JJKpeGyyladBNwRB0lLcC8lVi+ELaolN0j0Yzs6HiXjknNAgxjejeFu1I32ZeiaXDNykNhtnNweIIWc26f6y1G6fcpEnkqc2shuFIGn0qSRkilVLfccdJ9pi6yVVjS09lvCSNsi8dNPeU8QUxup+jHmez3zlPebfRyk1zZ8bFb6DBiZ0Dyj6fJepQ89AJ6Kcaf5Ynd3FgefDstwDxcRbDKnssM14iLiSjwri5VWdNP4KtsmmP2as63Xxc5MaVBbTjyk2i5/o8Rj852VMkBWPlskrlkBkliBwOTM4rIFXxZhSSwO2+gog==',
    'bm_sv': '830B3A15206003312D12E0B6FB4A2696~GupjwX5n1ZUaBybPwNV8B+/mIEouVASaWGBxPDg0p/S9lbZ98ziLYDEUArV6w2sGEn7NdWMub6mV5tEsGLoEgI48TmNE1/TUwtEyJcmtg2SlGBlGzFi64B2XdCR6oL2xy92x6zdNb6kOL3U+8YaBhQxd5nutL7sFddcENkQOb3E=',
    'DOT_COM_PHPSESSID': 'e4r4ekoramipe1qvahf0fp2630',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:62.0) Gecko/20100101 Firefox/62.0',
}

# The endpoint expects a single JSON-encoded query parameter, so the nested dict
# is serialized with json.dumps (requests cannot encode a nested dict by itself).
params = {
    'jsonParamObject': json.dumps({
        'page': 1,
        'position': 'LF,CF,RF,ST,LW,LM,CAM,CDM,CM,RM,RW,LWB,LB,CB,RB,RWB'
    })
}

r = requests.get(
    'https://www.easports.com/fifa/ultimate-team/api/fut/item',
    params=params,
    cookies=cookies
)

items = r.json()['items']
data = [parse_item(item) for item in items]
The JSON is quite big, so I wrote a couple of functions to help extract the desired data out of it.
data is a list of dicts. This is what a single element looks like:
>>> data[0]
{'name': 'Cristiano Ronaldo', 'type': 'TEAM OF THE YEAR', 'OVR': 99, 'POS': 'LW', 'PAC': 98, 'DRI': 98, 'SHO': 99, 'DEF': 50, 'PAS': 94, 'PHY': 95}
You may need to change the cookie values to the ones set by your browser.
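If you then want the result as a table, here is a minimal sketch using the standard csv module (the field names simply mirror the keys produced by parse_item above, and players.csv is just an example file name):
import csv

fieldnames = ['name', 'type', 'OVR', 'POS', 'PAC', 'DRI', 'SHO', 'DEF', 'PAS', 'PHY']
with open('players.csv', 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(data)  # data is the list of dicts built above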

Related

Submitting Form Data with Python Requests POST not Working

I'm writing a Python script to automatically check dog re-homing sites for dogs that we might be able to adopt as they become available. However, I'm stuck completing the form data on this site and can't figure out why.
The form attributes state it should use a POST method, and I've gone through all of the inputs for the form and created a payload.
I expect the page with the search results to be returned and the HTML scraped from the results page so I can start processing it, but the scrape is just the form page and never has the results.
I've tried using .get with the payload as params, the URL with the payload, and using the requests-html library to render any JavaScript elements, without success.
If you paste the url_w_payload into a browser it loads the page and says one of the fields is empty. If you then press Enter in the URL bar again to reload the page without modifying the URL, it loads... something to do with cookies maybe?
import requests
from requests_html import HTMLSession
session = HTMLSession()
form_url = "https://www.rspca.org.uk/findapet?p_p_id=petSearch2016_WAR_ptlPetRehomingPortlets&p_p_lifecycle=1&p_p_state=normal&p_p_mode=view&_petSearch2016_WAR_ptlPetRehomingPortlets_action=search"
url_w_payload = "https://www.rspca.org.uk/findapet?p_p_id=petSearch2016_WAR_ptlPetRehomingPortlets&p_p_lifecycle=1&p_p_state=normal&p_p_mode=view&_petSearch2016_WAR_ptlPetRehomingPortlets_action=search&noPageView=false&animalType=DOG&freshSearch=false&arrivalSort=false&previousAnimalType=&location=WC2N5DU&previousLocation=&prevSearchedPostcode=&postcode=WC2N5DU&searchedLongitude=-0.1282688&searchedLatitude=51.5072106"
payload = {'noPageView': 'false','animalType': 'DOG', 'freshSearch': 'false', 'arrivalSort': 'false', 'previousAnimalType': '', 'location': 'WC2N5DU', 'previousLocation': '','prevSearchedPostcode': '', 'postcode': 'WC2N5DU', 'searchedLongitude': '-0.1282688', 'searchedLatitude': '51.5072106'}
#req = requests.post(form_url, data = payload)
#with open("requests_output.txt", "w") as f:
# f.write(req.text)
ses = session.post(form_url, data = payload)
ses.html.render()
with open("session_output.txt", "w") as f:
f.write(ses.text)
print("Done")
There are a few hoops to jump through with cookies and headers, but once you get those right, you'll get the proper response.
Here's how to do it:
import time
from urllib.parse import urlencode

import requests
from bs4 import BeautifulSoup

query_string = {
    "p_p_id": "petSearch2016_WAR_ptlPetRehomingPortlets",
    "p_p_lifecycle": 1,
    "p_p_state": "normal",
    "p_p_mode": "view",
    "_petSearch2016_WAR_ptlPetRehomingPortlets_action": "search",
}

payload = {
    'noPageView': 'false',
    'animalType': 'DOG',
    'freshSearch': 'false',
    'arrivalSort': 'false',
    'previousAnimalType': '',
    'location': 'WC2N5DU',
    'previousLocation': '',
    'prevSearchedPostcode': '',
    'postcode': 'WC2N5DU',
    'searchedLongitude': '-0.1282688',
    'searchedLatitude': '51.5072106',
}


def make_cookies(cookie_dict: dict) -> str:
    return "; ".join(f"{k}={v}" for k, v in cookie_dict.items())


with requests.Session() as connection:
    main_url = "https://www.rspca.org.uk"
    connection.headers["User-Agent"] = "Mozilla/5.0 (X11; Linux x86_64) " \
                                       "AppleWebKit/537.36 (KHTML, like Gecko) " \
                                       "Chrome/90.0.4430.212 Safari/537.36"
    r = connection.get(main_url)
    cookies = make_cookies(r.cookies.get_dict())
    additional_string = f"; cb-enabled=enabled; " \
                        f"LFR_SESSION_STATE_10110={int(time.time())}"
    post_url = f"https://www.rspca.org.uk/findapet?{urlencode(query_string)}"
    connection.headers.update(
        {
            "cookie": cookies + additional_string,
            "referer": post_url,
            "content-type": "application/x-www-form-urlencoded",
        }
    )
    response = connection.post(post_url, data=urlencode(payload)).text
    dogs = BeautifulSoup(response, "lxml").find_all("a", class_="detailLink")
    print("\n".join(f"{main_url}{dog['href']}" for dog in dogs))
Output (shortened for brevity; there is no need to paginate, as all dogs come back in one response):
https://www.rspca.org.uk/findapet/details/-/Animal/JAY_JAY/ref/217747/rehome/
https://www.rspca.org.uk/findapet/details/-/Animal/STORM/ref/217054/rehome/
https://www.rspca.org.uk/findapet/details/-/Animal/DASHER/ref/205702/rehome/
https://www.rspca.org.uk/findapet/details/-/Animal/EVE/ref/205701/rehome/
https://www.rspca.org.uk/findapet/details/-/Animal/SEBASTIAN/ref/178975/rehome/
https://www.rspca.org.uk/findapet/details/-/Animal/FIJI/ref/169578/rehome/
https://www.rspca.org.uk/findapet/details/-/Animal/ELLA/ref/154419/rehome/
https://www.rspca.org.uk/findapet/details/-/Animal/BEN/ref/217605/rehome/
https://www.rspca.org.uk/findapet/details/-/Animal/SNOWY/ref/214416/rehome/
https://www.rspca.org.uk/findapet/details/-/Animal/BENSON/ref/215141/rehome/
https://www.rspca.org.uk/findapet/details/-/Animal/BELLA/ref/207716/rehome/
and much more ...
PS. I really enjoyed this challenge as I have two dogs from a shelter. Keep it up, man!

The python parser does not read information from the site, but returns None

I'm making a Python parser for the site: https://www.kinopoisk.ru/lists/series-top250/
The task is to pick out the film genres for each film (displayed on the page as a 'span' with class_='selection-film-item-meta__meta-additional-item').
import requests
from bs4 import BeautifulSoup

URL = 'https://www.kinopoisk.ru/lists/series-top250/'
HEADERS = {'user-agent': 'Mozilla/5.1 (Windows NT 7.0; Win64; x64; rv:87.0) Gecko/20100101 Firefox/87.0',
           'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'}


def get_html(url, params=''):
    r = requests.get(url, headers=HEADERS, params=params)
    return r


def get_content(html):
    soup = BeautifulSoup(html, 'html.parser')
    items = soup.find_all('span', class_='selection-film-item-meta__meta-additional-item')
    cards = []
    for item in items:
        cards.append(
            {
                'title': item.find('span', class_='title')
            }
        )
    return cards


html = get_html(URL)
print(get_content(html.text))
I can't understand why it gives the result: [{'title': None}, {'title': None}, {'title': None}, ... {'title': None}]
I'm definitely getting some captcha blocks from my local machine
https://www.kinopoisk.ru/showcaptcha?cc=1&retpath=https%3A//www.kinopoisk.ru/lists/series-top250%3F_ea4584...
but running from Google Colab I was able to reproduce your error, and since you are running from a VPN you probably will not encounter this issue.
The real issue here is that the items don't contain any element with class title, so naturally your dictionary is being filled with None. Since the class you are looking for has a similar sibling (a span with the same class name holding the country), you would have to skip every other element from the result to get only the film genres (a small sketch of that follows the HTML below).
<span class="selection-film-item-meta__meta-additional-item">США</span>
<span class="selection-film-item-meta__meta-additional-item">мультфильм, фэнтези</span>
<span class="selection-film-item-meta__meta-additional-item">США</span>
<span class="selection-film-item-meta__meta-additional-item">мультфильм, комедия</span>
I would suggest using a parent element so you can extract several pieces of information from each film card with more specificity:
def get_content(html):
    soup = BeautifulSoup(html, 'html.parser')
    items = soup.find_all('div', class_='selection-film-item-meta selection-film-item-meta_theme_desktop')
    cards = []
    for item in items:
        title = item.find('p', {'class': 'selection-film-item-meta__name'})
        additional = item.find_all('span', {'class': 'selection-film-item-meta__meta-additional-item'})
        cards.append(
            {
                'title': title.get_text(),
                'country': additional[0].get_text(),
                'genre': additional[1].get_text(),
            }
        )
    return cards
[{
    'title': 'Аватар: Легенда об Аанге',
    'country': 'США',
    'genre': 'мультфильм, фэнтези'
}, {
    'title': 'Гравити Фолз',
    'country': 'США',
    'genre': 'мультфильм, комедия'
}, {
    'title': 'Друзья',
    'country': 'США',
    'genre': 'комедия, мелодрама'
}, {
    ...
    ...

How can we reach the information with the opensubtitles API?

I'm trying to get the first downloadable 'srt' zip link. I don't need more than one file of information. When I tried a less famous movie such as Shame (2011), my code worked, but when I tried Avatar it doesn't work. I think the code is trying to fetch information for a lot of 'srt' files, and the API then blocks the request.
How can I reach the first English srt file download link?
from xmlrpc.client import ServerProxy
from pprint import pprint
imdb='tt0499549'#-->Avatar
#'tt1723811'-->Shame 2011
server = ServerProxy("http://api.opensubtitles.org/xml-rpc")
token = server.LogIn('yourusername', 'yourpassword', 'eng', 'TemporaryUserAgent')['token']
response = server.SearchSubtitles(token, [{'sublanguageid': 'eng', 'query':imdb }])#'moviehash':"0"
pprint(response)
You only have five attempts with TemporaryUserAgent.
Check out OpenSubtitles' new API and its documentation. It's way easier to use than the older API.
Grabbing subtitles is as easy as
headers = {
    'Api-Key': api_key,
}

params = (
    ('imdb_id', movie_id),
)

response = requests.get('https://www.opensubtitles.com/api/v1/subtitles', headers=headers, params=params)
Where api_key is your API key from their website, and movie_id is the movie's IMDb ID (e.g., Titanic's ID is 0120338, which can be found within the URL of its movie page on IMDb: https://www.imdb.com/title/tt0120338/).
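If you only have the IMDb URL, a small sketch of pulling the numeric ID out of it (the URL below is just the Titanic example from above):
import re

imdb_url = "https://www.imdb.com/title/tt0120338/"
movie_id = re.search(r"tt(\d+)", imdb_url).group(1)  # -> '0120338'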
An example of the response returned looks like this:
{'id': '5164746',
 'type': 'subtitle',
 'attributes': {
     'subtitle_id': '5164746',
     'language': 'en',
     'download_count': 9608,
     'new_download_count': 46,
     'hearing_impaired': False,
     'hd': True,
     'format': None,
     'fps': 23.976,
     'votes': 0,
     'points': 0,
     'ratings': 0.0,
     'from_trusted': False,
     'foreign_parts_only': False,
     'auto_translation': False,
     'ai_translated': False,
     'machine_translated': None,
     'upload_date': '2020-02-09T13:59:42Z',
     'release': '2160p.4K.BluRay.x265.10bit.AAC5.1-[YTS.MX]',
     'comments': "Slightly resynced the 1080p.WEBRip.x264-[YTS.LT] version by explosiveskull to this 4K release. HI removed. I didn't do 4K sync for Infinity War, as they're already on site here:\r\nHi: https://www.opensubtitles.org/en/subtitles/7436082/avengers-infinity-war-en\r\nNo HI: https://www.opensubtitles.org/en/subtitles/7436058/avengers-infinity-war-en",
     'legacy_subtitle_id': 8092829,
     'uploader': {
         'uploader_id': 66694,
         'name': 'pooond',
         'rank': 'bronze member'},
     'feature_details': {
         'feature_id': 626618,
         'feature_type': 'Movie',
         'year': 2019,
         'title': 'Avengers: Endgame',
         'movie_name': '2019 - Avengers: Endgame',
         'imdb_id': 4154796,
         'tmdb_id': 299534},
     'url': 'https://www.opensubtitles.com/en/subtitles/legacy/8092829',
     'related_links': {
         'label': 'All subtitles for Avengers: Endgame',
         'url': 'https://www.opensubtitles.com/en/movies/2019-untitled-avengers-movie',
         'img_url': 'https://s9.osdb.link/features/8/1/6/626618.jpg'},
     'files': [{
         'file_id': 5274788,
         'cd_number': 1,
         'file_name': 'Avengers.Endgame.2019.2160p.4K.BluRay.x265.10bit.AAC5.1-[YTS.MX].srt'}]}}
To download a file, you would take the 'file_id' and put it into a download request to the OpenSubtitles API like this:
headers = {
    'Api-Key': api_key,
    'Authorization': auth,
    'Content-Type': 'application/json',
}

data = '{"file_id":5274788}'

response = requests.post('https://www.opensubtitles.com/api/v1/download', headers=headers, data=data)
Where auth is the authorization key you get from their API (/api/v1/login endpoint):
headers = {
    'Api-Key': api_key,
    'Content-Type': 'application/json',
}

data = '{"username":"__USERNAME","password":"__PASSWORD"}'

response = requests.post('https://www.opensubtitles.com/api/v1/login', headers=headers, data=data)
where __USERNAME and __PASSWORD are your account's username and password.
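The auth value used in the download request above is taken from this login response; the answer below does the same thing via its 'token' field, e.g.:
auth = response.json()['token']  # token returned by the /api/v1/login endpoint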
Here is a solution:
import requests
import json
from pprint import pprint

url = "https://www.opensubtitles.com/api/v1/login"
headers = {'api-key': 'YOUR API KEY', 'content-type': 'application/json'}
user = {'username': 'YOUR USERNAME', 'password': "YOUR USER PASSWORD"}

try:
    login_response = requests.post(url, data=json.dumps(user), headers=headers)
    login_response.raise_for_status()
    login_json_response = login_response.json()
    login_token = login_json_response['token']
except Exception:
    print("Something went wrong, check again...")

imdb_id = "tt0499549"

headers = {
    'Api-Key': 'YOUR API KEY',
}
params = (
    ('imdb_id', imdb_id),
)

query_response = requests.get('https://www.opensubtitles.com/api/v1/subtitles', params=params, headers=headers)
query_json_response = query_response.json()
print("Report:", query_response)
#pprint(query_json_response)  # All data here...

query_file_name = query_json_response['data'][0]['attributes']['files'][0]['file_name']
query_file_no = query_json_response['data'][0]['attributes']['files'][0]['file_id']
movie_img = query_json_response['data'][0]['attributes']['related_links']['img_url']

print("Movie Image url:", movie_img)
print("File Number:", query_file_no)
print("Subtitle File Name:", query_file_name)

download_url = "https://www.opensubtitles.com/api/v1/download"
download_headers = {'api-key': 'YOUR API KEY',
                    'authorization': login_token,
                    'content-type': 'application/json'}
download_file_id = {'file_id': query_file_no}

download_response = requests.post(download_url, data=json.dumps(download_file_id), headers=download_headers)
download_json_response = download_response.json()
print("Report:", download_response)
print(download_json_response)

link = download_json_response['link']
saved_file_name = "subtitle.srt"

r = requests.get(link)
with open(saved_file_name, 'wb') as f:
    f.write(r.content)

Use search function on a website with Python Requests (eBay)

I'm trying to create a Python program using the Requests library that searches eBay for an item the user enters. Rather than hard-coding the URL, is it possible to use the requests library to perform an eBay search (or a search on any website)?
I believe what you want here is to input text into a search element. According to Real Python:
The requests library is the de facto standard for making HTTP requests in Python.
I would recommend using Selenium to control the website, e.g. typing text into an element and pressing a button on the page (a minimal sketch follows below).
However, if you still want to use requests, then try to find the API endpoint which handles the searching and use the POST method to get data from it.
resp = requests.post(url)
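Here is a minimal Selenium sketch of the first suggestion; the element ID gh-ac for eBay's search box is an assumption based on the current markup, so inspect the page and adjust if needed:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

driver = webdriver.Chrome()  # assumes a chromedriver is available on PATH
driver.get("https://www.ebay.com")

search_box = driver.find_element(By.ID, "gh-ac")  # assumed ID of the search box
search_box.send_keys("hot wheels")
search_box.send_keys(Keys.RETURN)  # press Enter instead of locating the search button

print(driver.title)  # title of the results page
driver.quit()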
I created an eBay developer account to access the API, then wrote a small script to search eBay for historical pricing on an item. Save it as search.py and call it like this:
./search.py "ebay item you are looking for"
You can change the itemFilter to your liking; currently it is set to sold items since 10-10-2019. The complete list is here: https://developer.ebay.com/devzone/finding/callref/types/ItemFilterType.html
The comments at the bottom show the complete set of fields returned from eBay; you can pick and choose the fields you like and add them to a print statement.
Also, this script will return more than the first page of items, and each page costs you one of your 5,000 developer queries for the day. I am unable to get it to work with the sandbox, no matter what I try. I believe the eBay sandbox is broken.
#!/usr/local/bin/python3
from ebaysdk.finding import Connection
import sys

DEBUG = False

#search_keywords = "2019 Hot Wheels Dumbo"
search_keywords = sys.argv[1]
print("Search Keywords: " + search_keywords)


# Function accepts keywords for query and pageNumber of search to pull
# Ebay will only return 100 items per search
def build_request(keywords, pageNumber):
    # Create a request structure
    # Item Filter List https://developer.ebay.com/devzone/finding/callref/types/ItemFilterType.html
    # Note: each filter must be its own dict; merging them into one dict would
    # silently keep only the last name/value pair.
    request = {
        'keywords': keywords,
        'itemFilter': [
            {'name': 'condition', 'value': 'new'},
            {'name': 'SoldItemsOnly', 'value': True},
            {'name': 'EndTimeFrom', 'value': '2019-10-10T00:00:00.000Z'},
        ],
        'paginationInput': {
            'entriesPerPage': 100,  # EBay limits API Calls to 100 items per page
            'pageNumber': pageNumber
        },
        'sortOrder': 'PricePlusShippingLowest',
    }
    return request


# Connect using yaml file to EBAY-US production site
# put in __main__ just in case we turn this into a module later
if __name__ == '__main__':
    api = Connection(config_file='ebay.yaml', debug=False, siteid="EBAY-US")
    #api = Connection(config_file='ebay.yaml', debug=False, domain="api.sandbox.ebay.com", siteid="EBAY-US")

    # Run the request
    query = build_request(search_keywords, 1)
    query['paginationInput']['pageNumber'] = 1
    response = api.execute('findCompletedItems', query)

    if DEBUG:
        print(response.dict())  # Use this to see the dictionary structure

    # Display how many entries and results are returned
    print("API Call: findCompletedItems")
    print("----------------------------")
    print(f"totalEntries: {response.reply.paginationOutput.totalEntries}, totalPages: {response.reply.paginationOutput.totalPages}")
    maxpage = int(str(response.reply.paginationOutput.totalPages)) + 0

    # Display item information fields from the request, see below for all possible fields
    for item in response.reply.searchResult.item:
        print(f"Date: {item.listingInfo.endTime} Title: {item.title}, Price: {item.sellingStatus.currentPrice.value} Shipping: {item.shippingInfo.shippingServiceCost.value}")

    # Now run the request for each page and change the page in the request each time
    for page in range(2, maxpage):
        print("**** PAGE: " + str(page) + " of " + str(maxpage) + " ****")
        # Rebuild the request and update the page number, then run it
        query['paginationInput']['pageNumber'] = page
        response = api.execute('findCompletedItems', query)
        # Display item information fields from the request, see below for all possible fields
        for item in response.reply.searchResult.item:
            print(f"Date: {item.listingInfo.endTime} Title: {item.title}, Price: {item.sellingStatus.currentPrice.value} Shipping: {item.shippingInfo.shippingServiceCost.value}")
#{'ack': 'Success', 'version': '1.13.0', 'timestamp': '2019-10-16T01:28:25.891Z',
#
#searchResult': {'item': [{'itemId': '123719989207', 'title': '2019 HOT WHEELS 2 SET CORVETTE STINGRAY SUPER CHROMES 5/5 TREASURE HUNT PAIR', 'globalId': 'EBAY-US', 'primaryCategory': {'categoryId': '180506', 'categoryName': 'Contemporary Manufacture'}, 'galleryURL': 'https://thumbs4.ebaystatic.com/m/mFuyRQgYjSutGli33dqsqcA/140.jpg', 'viewItemURL': 'https://www.ebay.com/itm/2019-HOT-WHEELS-2-SET-CORVETTE-STINGRAY-SUPER-CHROMES-5-5-TREASURE-HUNT-PAIR-/123719989207', 'paymentMethod': 'PayPal', 'autoPay': 'false', 'postalCode': '54650', 'location': 'Onalaska,WI,USA', 'country': 'US', 'shippingInfo': {'shippingServiceCost': {'_currencyId': 'USD', 'value': '6.0'}, 'shippingType': 'Flat', 'shipToLocations': 'Worldwide', 'expeditedShipping': 'false', 'oneDayShippingAvailable': 'false', 'handlingTime': '2'}, 'sellingStatus': {'currentPrice': {'_currencyId': 'USD', 'value': '9.0'}, 'convertedCurrentPrice': {'_currencyId': 'USD', 'value': '9.0'}, 'sellingState': 'Ended'}, 'listingInfo': {'bestOfferEnabled': 'false', 'buyItNowAvailable': 'false', 'startTime': '2019-04-02T22:14:03.000Z', 'endTime': '2019-10-02T18:44:49.000Z', 'listingType': 'StoreInventory', 'gift': 'false', 'watchCount': '2'}, 'returnsAccepted': 'false', 'condition': {'conditionId': '1000', 'conditionDisplayName': 'New'}, 'isMultiVariationListing': 'false', 'topRatedListing': 'false'},
#
#
#{'itemId': '153679182310', 'title': "Hot Wheels 2019 Super Treasure Hunt '68 Mercury Cougar Loose 1/64 STH Green", 'globalId': 'EBAY-US', 'primaryCategory': {'categoryId': '73252', 'categoryName': 'Collections & Lots'}, 'galleryURL': 'https://thumbs3.ebaystatic.com/m/mEN9EsbCJY0wb6WzXjO8hNg/140.jpg', 'viewItemURL': 'https://www.ebay.com/itm/Hot-Wheels-2019-Super-Treasure-Hunt-68-Mercury-Cougar-Loose-1-64-STH-Green-/153679182310', 'paymentMethod': 'PayPal', 'autoPay': 'false', 'location': 'Malaysia', 'country': 'MY', 'shippingInfo': {'shippingServiceCost': {'_currencyId': 'USD', 'value': '9.0'}, 'shippingType': 'Flat', 'shipToLocations': 'Worldwide', 'expeditedShipping': 'false', 'oneDayShippingAvailable': 'false', 'handlingTime': '15'}, 'sellingStatus': {'currentPrice': {'_currencyId': 'USD', 'value': '9.9'}, 'convertedCurrentPrice': {'_currencyId': 'USD', 'value': '9.9'}, 'bidCount': '1', 'sellingState': 'Ended'}, 'listingInfo': {'bestOfferEnabled': 'false', 'buyItNowAvailable': 'false', 'startTime': '2019-10-10T04:13:32.000Z', 'endTime': '2019-10-15T04:13:32.000Z', 'listingType': 'Auction', 'gift': 'false', 'watchCount': '1'}, 'returnsAccepted': 'false', 'condition': {'conditionId': '3000', 'conditionDisplayName': 'Used'}, 'isMultiVariationListing': 'false', 'topRatedListing': 'false'}],
#
#'_count': '100'}, 'paginationOutput': {'pageNumber': '3', 'entriesPerPage': '100', 'totalPages': '40', 'totalEntries': '3966'}}
You can scrape eBay using the BeautifulSoup web scraping library.
To avoid entering the full request URL, you can define params holding the necessary request parameters, with the search query itself taken from user input:
query = input('Your query is: ')

params = {
    '_nkw': query,  # search query
    '_pgn': 1       # page number
    #'LH_Sold': '1' # shows sold items
}
If you use the requests library, the request might be blocked, because the default user-agent in the requests library is python-requests, so the website understands that it's a bot or a script sending the request. Check what your user-agent is.
An additional step besides providing a browser user-agent could be to rotate user-agents, for example switching between PC, mobile, and tablet, as well as between browsers, e.g. Chrome, Firefox, Safari, Edge and so on.
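A minimal sketch of that rotation idea (the user-agent strings below are only sample values; any set of current browser user-agents will do):
import random
import requests

user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Safari/605.1.15",
    "Mozilla/5.0 (Linux; Android 13; Pixel 7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Mobile Safari/537.36",
]

headers = {"User-Agent": random.choice(user_agents)}  # pick a different one on each request
page = requests.get("https://www.ebay.com/sch/i.html", params={"_nkw": "shirt"}, headers=headers, timeout=30)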
from bs4 import BeautifulSoup
import requests, json, lxml

# https://requests.readthedocs.io/en/latest/user/quickstart/#custom-headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36",
}

query = input('Your query is: ')

params = {
    '_nkw': query,  # search query
    '_pgn': 1       # page number
    #'LH_Sold': '1' # shows sold items
}

data = []

while True:
    page = requests.get('https://www.ebay.com/sch/i.html', params=params, headers=headers, timeout=30)
    soup = BeautifulSoup(page.text, 'lxml')

    print(f"Extracting page: {params['_pgn']}")
    print("-" * 10)

    for products in soup.select(".s-item__info"):
        title = products.select_one(".s-item__title span").text
        price = products.select_one(".s-item__price").text
        link = products.select_one(".s-item__link")["href"]

        data.append({
            "title": title,
            "price": price,
            "link": link
        })

    if soup.select_one(".pagination__next"):
        params['_pgn'] += 1
    else:
        break

print(json.dumps(data, indent=2, ensure_ascii=False))
Example output:
Your query is: shirt # query entry example
Extracting page: 1
----------
[
{
"title": "Men's Polo Shirt 100% Cotton Knockout Jeans NVY WHT 220 Stripe MEDIUM Free Ship",
"price": "$11.99",
"link": "https://www.ebay.com/itm/133992813518?hash=item1f329813ce:g:tWMAAOSwXBxhTP7Q&amdata=enc%3AAQAHAAAAwJ9%2BDbqKGCoZye6JelYY1tJHQWotUalKHQJ%2FixwyplnvOC60SofXkLVsNgRfoX09uOZLerjkBtwcW%2FQQa1wmJ6%2BYVEEagzH1GAK6Bx4rX%2BRNnj9g6SlvB2WagWETpbmrLdiFHGTIRvAL2EvfXDRqPFnEGWZ2nk%2BM0zEkiGzp%2F4ADUbPslGui3zTDJsIgVpXjAHzL2EUH3s7tiOxtd3qVTXxaE095evq5YrBgkJFJu4KB5o%2F%2BCiCURfy7xR%2FbTU7mnQ%3D%3D%7Ctkp%3ABlBMUJavlrOEYQ"
},
{
"title": "5 Pack Oroblu Micromodal Perfect Line Round Neck Short Sleeve T-Shirt",
"price": "$192.00",
"link": "https://www.ebay.com/itm/275287531865?hash=item40186a6159:g:OtUAAOSweKFiZr2S&amdata=enc%3AAQAHAAAAsMRLg1VeYAIKHTiXXdD8xv56DpaeH6jc3EhFP26RJ66bqmlzXHQrMMxuo78x6S2i8DfxvuzjbXrpmYYdyRLhzgQCoaauMNvRwVNuhx11qorNlPoHrig%2BdIGG2RB4xHmXdB2fjOciLCsdYkL23jaH23ehXakQu%2BrBzER%2F2v94Sdg%2BkchjwWmRidsv0kPfLRcpiy%2BOeDBHEas4i9EQY%2F0VAzLGj2U%2FwLdcqjqSjgngj%2BRr%7Ctkp%3ABlBMUJavlrOEYQ"
},
# ...
]
As an alternative, you can use the eBay Organic Results API from SerpApi. It's a paid API with a free plan that handles blocks and parsing on their backend.
Example code that paginates through all pages for the input query:
from serpapi import EbaySearch
import os, json

query = input('Your query is: ')

params = {
    "api_key": os.getenv("API_KEY"),  # serpapi api key
    "engine": "ebay",                 # search engine
    "ebay_domain": "ebay.com",        # ebay domain
    "_nkw": query,                    # search query
    "_pgn": 1                         # page number
    #"LH_Sold": "1"                   # shows sold items
}

search = EbaySearch(params)  # where data extraction happens

page_num = 0
data = []

while True:
    results = search.get_dict()  # JSON -> Python dict

    if "error" in results:
        print(results["error"])
        break

    for organic_result in results.get("organic_results", []):
        link = organic_result.get("link")
        price = organic_result.get("price")

        data.append({
            "price": price,
            "link": link
        })

    page_num += 1
    print(page_num)

    if "next" in results.get("pagination", {}):
        params['_pgn'] += 1
    else:
        break

print(json.dumps(data, indent=2))
Output:
[
{
"price": {
"raw": "$25.99",
"extracted": 25.99
},
"link": "https://www.ebay.com/itm/285018595898?hash=item425c6ea23a:g:mT0AAOSwBjljAFsl&amdata=enc%3AAQAHAAAAkI1P1C%2BE2boIutliCMWXCADm%2BXyUp2a6Q1qOjpifaAIo6%2FWD0yHCd8Mejyfc2jc%2BQ5zzVcITrcWM0XxIfiSUILMZFsMewB154skl5re5%2FS8W9kRrabjRdy%2BoC6aQoS%2FWGq%2F6A%2BZWQ1GQkcd5Tstamu%2FgzZKoL6VYfO4YpC4oO4Im23h0wiIfI0%2BxPG8uuFRMPw%3D%3D%7Ctkp%3ABk9SR_i1vbKEYQ"
},
{
"price": {
"raw": "$14.16",
"extracted": 14.16
},
"link": "https://www.ebay.com/itm/234347615312?hash=item369034d450:g:hvYAAOSwNspg0TAH&amdata=enc%3AAQAHAAAA0B1m3DPC4q0R4AQp6MO8rXnKt6qFIX2p%2BaypmySYXkIvi6XE3FHzpbtN%2B%2Bvd9P3TZPYu3fuQVl5kH0ZYDO5eqtnjh1EcZ%2Fb9rZMlMx6r6RcH%2B5wOY7X65bvRcmQ7OUmoaNGAMOZpOc4hg8vHj2afxCa%2FR7F3jDr1KjnHk%2BKnln3opoiqAVMFIoXv338f70KZw8CDd%2Fg9xU0jQlzgxDpDwSL6Y6OMz0oKxh4T%2BRUMKHj03VE5E9%2B8VKzPUMWAQ%2BZWuZyGMpWxwzn%2BomggywV5RhI%3D%7Ctkp%3ABk9SR_i1vbKEYQ"
},
# ...
]

Using mechanize bing search returns blank page

I am using mechanize to perform a Bing search and then I will process the results with Beautiful Soup. I have successfully performed Google and Yahoo searches with this same method, but when I do a Bing search all I get is a blank page.
I am thoroughly confused why this is the case and if anyone can shed any light on the matter that would be greatly appreciated. Here is a sample of the code I'm using:
from BeautifulSoup import BeautifulSoup
import mechanize
br = mechanize.Browser()
br.set_handle_robots(False)
br.open("http://www.bing.com/search?count=100&q=cheese")
content = br.response()
content = content.read()
soup = BeautifulSoup(content, convertEntities=BeautifulSoup.ALL_ENTITIES)
print soup
The result is a blank line printed.
You probably got a response saying that the answer is already in your browser cache. Try changing your query string a little, for example decrease count to 50.
You can also add some debugging code and see the headers returned by the server:
br.open("http://www.bing.com/search?count=50&q=cheese")
response = br.response()
headers = response.info()
print headers
content = response.read()
EDIT:
I have tried this query with count=100 in the Firefox and Opera browsers, and it seems that Bing does not like such a "big" count. When I decrease the count, it works. So this is not a mechanize or other Python library fault; your query is problematic for Bing. It also seems that a browser can query Bing with count=100, but it must first query Bing with some smaller count. Strange!
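If that is what is happening, a minimal warm-up sketch along those lines (this is only an assumption that an initial smaller-count request primes whatever Bing checks):
import mechanize

br = mechanize.Browser()
br.set_handle_robots(False)

# warm-up request with a smaller count first
br.open("http://www.bing.com/search?count=50&q=cheese")
br.response().read()

# then retry the larger count
br.open("http://www.bing.com/search?count=100&q=cheese")
content = br.response().read()
print(len(content))  # a non-trivial length suggests a real results page came back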
Another way to achieve this is by using requests with BeautifulSoup:
from bs4 import BeautifulSoup
import requests, lxml, json

headers = {
    'User-agent':
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}


def get_organic_results():
    html = requests.get('https://www.bing.com/search?q=nfs', headers=headers)
    soup = BeautifulSoup(html.text, 'lxml')

    bing_data = []

    for result in soup.find_all('li', class_='b_algo'):
        title = result.h2.text
        try:
            link = result.h2.a['href']
        except:
            link = None

        displayed_link = result.find('div', class_='b_attribution').text

        try:
            snippet = result.find('div', class_='b_caption').p.text
        except:
            snippet = None

        for inline in soup.find_all('div', class_='b_factrow'):
            try:
                inline_title = inline.a.text
            except:
                inline_title = None
            try:
                inline_link = inline.a['href']
            except:
                inline_link = None

        bing_data.append({
            'title': title,
            'link': link,
            'displayed_link': displayed_link,
            'snippet': snippet,
            'inline': [{'title': inline_title, 'link': inline_link}]
        })

    print(json.dumps(bing_data, indent=2))
# part of the created json output:
'''
[
  {
    "title": "Need for Speed Video Games - Official EA Site",
    "link": "https://www.ea.com/games/need-for-speed",
    "displayed_link": "https://www.ea.com/games/need-for-speed",
    "snippet": "Need for Speed Forums Buy Now All Games Forums Buy Now Learn More Buy Now Hit the gas and tear up the roads in this legendary action-driving series. Push your supercar to its limits and leave the competition in your rearview or shake off a full-scale police pursuit \u2013 it\u2019s all just a key-turn away.",
    "inline": [
      {
        "title": null,
        "link": null
      }
    ]
  }
]
'''
Alternatively, you can do the same thing using the Bing Organic Results API from SerpApi. It's a paid API with a free trial of 5,000 searches.
Code to integrate:
from serpapi import GoogleSearch
import os


def get_organic_results():
    params = {
        "api_key": os.getenv('API_KEY'),
        "engine": "bing",
        "q": "nfs most wanted"
    }

    search = GoogleSearch(params)
    results = search.get_dict()

    for result in results['organic_results']:
        title = result['title']
        link = result['link']
        displayed_link = result['displayed_link']

        try:
            snippet = result['snippet']
        except:
            snippet = None

        try:
            inline = result['sitelinks']['inline']
        except:
            inline = None

        print(f'{title}\n{link}\n{displayed_link}\n{snippet}\n{inline}\n')
# part of the output:
'''
Need for Speed: Most Wanted - Car Racing Game - Official ...
https://www.ea.com/games/need-for-speed/need-for-speed-most-wanted
https://www.ea.com/games/need-for-speed/need-for-speed-most-wanted
Jun 01, 2017 · To be Most Wanted, you’ll need to outrun the cops, outdrive your friends, and outsmart your rivals. With a relentless police force gunning to take you down, you’ll need to make split-second decisions. Use the open world to …
[{'title': 'Need for Speed No Limits', 'link': 'https://www.ea.com/games/need-for-speed/need-for-speed-no-limits'}, {'title': 'Buy Now', 'link': 'https://www.ea.com/games/need-for-speed/need-for-speed-heat/buy'}, {'title': 'Need for Speed Undercover', 'link': 'https://www.ea.com/games/need-for-speed/need-for-speed-undercover'}, {'title': 'Need for Speed The Run', 'link': 'https://www.ea.com/games/need-for-speed/need-for-speed-the-run'}, {'title': 'News', 'link': 'https://www.ea.com/games/need-for-speed/need-for-speed-payback/news'}]
'''
Disclaimer, I work for SerpApi.
