I am trying to web scrape some useful data on academic papers from Google Scholar.
So far I've had no problem getting the Title, Year of publication, Citation count, and "Cited by" URL.
I would now like to get the full citation, which includes the full author list, journal, pages (if any), etc. (see the snapshot below).
Full APA citation appearing when clicking on the double quote (circled in red)
I use ScraperAPI to handle proxies and CAPTCHAs (they offer 5000 requests for free).
Below is the code I have (I'm aware it's very heavy and not optimal at all, but it does the job for now):
import requests
import numpy as np
import pandas as pd
import re
from bs4 import BeautifulSoup
APIKEY = "????????????????????"
BASE_URL = f"http://api.scraperapi.com?api_key={APIKEY}&url="
def scraper_api(query, n_pages):
"""Uses scraperAPI to scrape Google Scholar for
papers' Title, Year, Citations, Cited By url returns a dataframe
---------------------------
parameters:
query: in the following format "automation+container+terminal"
n_pages: number of pages to scrape
---------------------------
returns:
dataframe with the following columns:
"Title": title of each papers
"Year": year of publication of each paper
"Citations": citations count
"cited_by_url": URL given by "cited by" button, reshaped to allow further
scraping
---------------------------"""
pages = np.arange(0,(n_pages*10),10)
papers = []
for page in pages:
print(f"Scraping page {int(page/10) + 1}")
webpage = f"https://scholar.google.com/scholar?start={page}&q={query}&hl=fr&as_sdt=0,5"
url = BASE_URL + webpage
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")
for paper in soup.find_all("div", class_="gs_ri"):
# get the title of each paper
title_tag = paper.find("h3", class_="gs_rt").find("a")
if title_tag is None:
    # some results (e.g. [BOOK]/[CITATION] entries) have no link, only a span
    title_tag = paper.find("h3", class_="gs_rt").find("span")
title = title_tag.text
# get the year of publication of each paper
txt_year = paper.find("div", class_="gs_a").text
year = re.findall('[0-9]{4}', txt_year)
if year:
year = list(map(int,year))[0]
else:
year = 0
# get number of citations for each paper
txt_cite = paper.find("div", class_="gs_fl").find_all("a")[2].string
if txt_cite:
citations = re.findall('[0-9]+', txt_cite)
if citations:
citations = list(map(int,citations))[0]
else:
citations = 0
else:
citations = 0
# get the "cited_by" url for later scraping of citing papers
# had to extract the "href" tag and then reshuffle the url as not
# following same pattern for pagination
urls = paper.find("div", class_="gs_fl").find_all(href=True)
new_url = "no citations"  # default, in case no "cited by" link is found
for link in urls:
    if "cites" in link["href"]:
        cited_url = link["href"]
        index1 = cited_url.index("?")
        url_slices = []
        url_slices.append(cited_url[:index1+1])
        url_slices.append(cited_url[index1+1:])
        index_and = url_slices[1].index("&")
        url_slices.append(url_slices[1][:index_and+1])
        url_slices.append(url_slices[1][index_and+1:])
        url_slices.append(url_slices[3][:23])
        del url_slices[1]
        new_url = "https://scholar.google.com.tw"+url_slices[0]+"start=00&hl=en&"+url_slices[3]+url_slices[1]+"scipsc="
        break  # stop at the first "cites" link so later links don't overwrite it
# appends everything in a list of dictionaries
papers.append({'title': title, 'year': year, 'citations': citations, 'cited_by_url': new_url})
# converts the list of dict to a pandas df
papers_df = pd.DataFrame(papers)
return papers_df
I would like to retrieve the full APA citation, but it seems it's not on the same HTML page and there is no href associated with it.
If you have any lead, that would help me a lot! Thanks :)
Open F12, go to the Network tab, then click on the citation symbol. You should see a request appear. The URL of the request looks like this:
"https://scholar.google.com/scholar?q=info:dgGDGDdf5:scholar.google.com/&output=cite&scirp=0&hl=fr"
where "dgGDGDdf5" is the "data-cid" found in each result row of the main page. Each "data-cid" corresponds to a unique article.
So, extract this "data-cid" and make a sub-request with this URL, then extract the APA (or another) citation format.
Implementation example :
import requests as rq
from bs4 import BeautifulSoup as bs
from urllib.parse import urlencode
headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0"}
def google_scholar(query, n_pages, since_year):
data = []
encoded_query = urlencode({"q": query})
for start in range(0, n_pages*10, 10):
url = "https://scholar.google.com/scholar?as_ylo=%s&%s&hl=fr&start=%s" % (since_year, encoded_query, start)
resp = rq.get(url, headers=headers)
soup = bs(resp.content, "lxml")
print(soup)
main_div = soup.find_all('div', {'id': 'gs_res_ccl_mid'})[0]
divs = main_div.find_all('div', {'class': 'gs_r gs_or gs_scl'})
for div in divs:
data_cid = div['data-cid']
print(data_cid)
title = div.find_all('h3', {'class': 'gs_rt'})[0].text
infos = div.find_all('div', {'class': 'gs_a'})[0].text
# APA citation
url_cite = "https://scholar.google.com/scholar?q=info:%s:scholar.google.com/&output=cite&scirp=0&hl=fr" % (data_cid)
resp2 = rq.get(url_cite, headers=headers)
# --> extract apa here from resp2
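To fill in that last step, one possible sketch (continuing the loop above and reusing resp2, data, data_cid, title and infos; the assumption that the cite popup is a table with one th per citation style and the citation text in the td is based on the current markup and may change):
soup2 = bs(resp2.content, "lxml")
apa = None
for row in soup2.select("tr"):
    th, td = row.find("th"), row.find("td")
    # each row looks roughly like <tr><th>APA</th><td>...citation text...</td></tr>
    if th is not None and td is not None and th.get_text(strip=True) == "APA":
        apa = td.get_text(" ", strip=True)
        break
data.append({"data_cid": data_cid, "title": title, "infos": infos, "apa": apa})
After both loops finish, return data from google_scholar().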
The data-cid attribute is a unique publication ID. You need to parse all of them from the page, then make another request to the citation URL with the parsed data-cid, as ce.teuf stated.
The example below will work for roughly 10-20 requests, then Google will throw a CAPTCHA or you'll hit the rate limit. The ideal solution is to use a CAPTCHA-solving service as well as proxies.
Example code:
from bs4 import BeautifulSoup
import requests, lxml
params = {
"q": "automated container terminal", # search query
"hl": "en" # language
}
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.3538.102 Safari/537.36 Edge/18.19582",
'accept-language': 'en-US,en',
'accept': 'text/html,application/xhtml+xml,application/xml',
"server": "scholar",
"referer": f"https://scholar.google.com/scholar?hl={params['hl']}&q={params['q']}",
}
def cite_ids() -> list[str]:
response = requests.get("https://scholar.google.com/scholar", params=params, headers=headers)
soup = BeautifulSoup(response.text, "lxml")
# returns a list of publication ID's -> U8bh6Ca9uwQJ
return [result["data-cid"] for result in soup.select(".gs_or")]
def scrape_cite_results() -> list[dict[str]]:
cited_authors = []
for cite_id in cite_ids():
response = requests.get(f"https://scholar.google.com/scholar?output=cite&q=info:{cite_id}:scholar.google.com", headers=headers)
soup = BeautifulSoup(response.text, "lxml")
for result in soup.select("tr"):
if result.select_one("th") is not None and "APA" in result.select_one("th").text:
title = result.select_one("th").text
authors = result.select_one("td").text
cited_authors.append({"title": title, "cited_authors": authors})
return cited_authors
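A minimal usage sketch for the functions above (assuming the selectors still match Google Scholar's current markup):
if __name__ == "__main__":
    for row in scrape_cite_results():
        print(row["title"], "->", row["cited_authors"])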
Alternatively, you can achieve it using the Google Scholar Organic Results API from SerpApi. It's a paid API with a free plan.
The difference in this scenario is that you don't have to tinker with selectors to find the proper ones, or figure out how to bypass blocks from Google when you send a bunch of requests and hit an IP rate limit or get a CAPTCHA.
Code to integrate:
import os, json
from serpapi import GoogleSearch
def organic_results() -> list[str]:
params = {
"api_key": os.getenv("API_KEY"),
"engine": "google_scholar",
"q": "automated container terminal", # search query
"hl": "en" # language
}
search = GoogleSearch(params)
results = search.get_dict()
return [result["result_id"] for result in results["organic_results"]]
def cite_results() -> list[dict[str]]:
citation_results = []
for citation_id in organic_results():
params = {
"api_key": os.getenv("API_KEY"),
"engine": "google_scholar_cite",
"q": citation_id
}
search = GoogleSearch(params)
results = search.get_dict()
for result in results["citations"]:
if "APA" in result["title"]:
institution = result["title"]
authors = result["snippet"]
citation_results.append({
"institution": institution,
"authors": authors
})
return citation_results
print(json.dumps(cite_results(), indent=2))
'''
[
{
"institution": "APA",
"authors": "Vis, I. F., & Harika, I. (2004). Comparison of vehicle types at an automated container terminal. OR Spectrum, 26(1), 117-143."
},
{
"institution": "APA",
"authors": "Vis, I. F., De Koster, R., Roodbergen, K. J., & Peeters, L. W. (2001). Determination of the number of automated guided vehicles required at a semi-automated container terminal. Journal of the Operational research Society, 52(4), 409-417."
},
{
"institution": "APA",
"authors": "Zhen, L., Lee, L. H., Chew, E. P., Chang, D. F., & Xu, Z. X. (2011). A comparative study on two types of automated container terminal systems. IEEE Transactions on Automation Science and Engineering, 9(1), 56-69."
},
{
"institution": "APA",
"authors": "Liu, C. I., Jula, H., & Ioannou, P. A. (2002). Design, simulation, and evaluation of automated container terminals. IEEE Transactions on intelligent transportation systems, 3(1), 12-26."
},
{
"institution": "APA",
"authors": "Park, T., Choe, R., Kim, Y. H., & Ryu, K. R. (2011). Dynamic adjustment of container stacking policy in an automated container terminal. International Journal of Production Economics, 133(1), 385-392."
},
{
"institution": "APA",
"authors": "Bae, H. Y., Choe, R., Park, T., & Ryu, K. R. (2011). Comparison of operations of AGVs and ALVs in an automated container terminal. Journal of Intelligent Manufacturing, 22(3), 413-426."
},
{
"institution": "APA",
"authors": "Luo, J., Wu, Y., & Mendes, A. B. (2016). Modelling of integrated vehicle scheduling and container storage problems in unloading process at an automated container terminal. Computers & Industrial Engineering, 94, 32-44."
},
{
"institution": "APA",
"authors": "Zhu, M., Fan, X., Cheng, H., & He, Q. (2010). Modeling and Simulation of Automated Container Terminal Operation. J. Comput., 5(6), 951-957."
},
{
"institution": "APA",
"authors": "Luo, J., & Wu, Y. (2020). Scheduling of container-handling equipment during the loading process at an automated container terminal. Computers & Industrial Engineering, 149, 106848."
},
{
"institution": "APA",
"authors": "Yang, X., Mi, W., Li, X., An, G., Zhao, N., & Mi, C. (2015). A simulation study on the design of a novel automated container terminal. IEEE Transactions on Intelligent Transportation Systems, 16(5), 2889-2899."
}
]
'''
Disclaimer, I work for SerpApi.
I will use this code to explain my doubt:
Using the url without sold filter
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
url = "https://www.ebay.es/sch/i.html?_from=R40&_trksid=p2334524.m570.l1313&_nkw=iphone+x&_sacat=0&LH_TitleDesc=0&_udlo=400&LH_Auction=1&_osacat=0&_odkw=Pok%C3%A9mon+card+Charizard+4%2F102&rt=nc"
r = requests.get(url)
soup = BeautifulSoup(r.text, "html.parser")
results = soup.find_all("div", {"class": "s-item__info clearfix"})
print(len(results))
Output: 12
Then I use the URL where there are only sold items; I check the HTML and the class is the same.
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
url = "https://www.ebay.es/sch/i.html?_from=R40&_nkw=iphone+x&_sacat=0&LH_TitleDesc=0&_udlo=400&LH_Auction=1&rt=nc&LH_Sold=1&LH_Complete=1"
r = requests.get(url)
soup = BeautifulSoup(r.text, "html.parser")
results = soup.find_all("div", {"class": "s-item__info clearfix"})
print(len(results))
Output: 0
I tried different classes but I can never obtain anything.
Thanks.
It was a CAPTCHA problem. Thanks!
There are several reasons why the output may be empty.
This is often because the site thinks it is being accessed by a bot: the default user-agent in the requests library is python-requests, and this can be prevented by passing your actual User-Agent in the "headers". This seems to be the reason you get a CAPTCHA.
If passing a User-Agent didn't work, the next step would be to rotate user-agents, for example, switching between PC, mobile, and tablet, as well as between browsers, e.g. Chrome, Firefox, Safari, Edge and so on.
If passing request headers is still not enough, you can try using proxies (ideally residential) in combination with request headers.
An additional step is to use a CAPTCHA solver, for example 2captcha. It allows bypassing all possible CAPTCHAs depending on the target website.
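As an illustration, rotating user-agents with requests can be as simple as picking a random string from a small pool before each request (the pool below is just a hypothetical example; use whatever UA strings you like):
import random
import requests

# small example pool of desktop/mobile user-agents
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Safari/605.1.15",
    "Mozilla/5.0 (Linux; Android 13; Pixel 7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Mobile Safari/537.36",
]

def get_with_random_ua(url, **kwargs):
    # pick a different User-Agent on every call
    headers = {"User-Agent": random.choice(USER_AGENTS)}
    return requests.get(url, headers=headers, timeout=30, **kwargs)

# usage: get_with_random_ua("https://www.ebay.es/sch/i.html", params={"_nkw": "iphone x"})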
Check the code using BeautifulSoup in online IDE.
from bs4 import BeautifulSoup
import requests, json, lxml
# https://requests.readthedocs.io/en/latest/user/quickstart/#custom-headers
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36",
}
params = {
'_nkw': 'iphone_x', # search query
'LH_Sold': '1', # shows sold items
'_pgn': 1 # page number
}
data = []
limit = 10 # page limit (if needed)
while True:
page = requests.get('https://www.ebay.es/sch/i.html', params=params, headers=headers, timeout=30)
soup = BeautifulSoup(page.text, 'lxml')
print(f"Extracting page: {params['_pgn']}")
print("-" * 10)
for products in soup.select(".s-item__info"):
title = products.select_one(".s-item__title span").text
price = products.select_one(".s-item__price").text
data.append({
"title" : title,
"price" : price
})
if params['_pgn'] == limit:
break
if soup.select_one(".pagination__next"):
params['_pgn'] += 1
else:
break
print(json.dumps(data, indent=2, ensure_ascii=False))
Example output:
[
{
"title": "Apple iPhone X 64 GB y 256 GB Grado A++ Desbloqueado - Excelente Estado Todos los Colores",
"price": "234,52 EUR"
},
{
"title": "Funda de silicona a prueba de golpes para iPhone 11 Pro Max 14Pro 8 7 SE 2022 colores",
"price": "4,56 EUR"
},
{
"title": "Apple iPhone X 64 GB 256 GB gris plateado sin contrato COMO NUEVO SIN MANCHA Wow ",
"price": "377,00 EUR a 409,00 EUR"
},
{
"title": "Funda transparente de silicona completa a prueba de golpes para iPhone 11 12 13 14 PRO MAX Mini X XR 8",
"price": "1,13 EUR a 4,06 EUR"
},
{
"title": "Apple iPhone X - 256 GB - Plateado (Desbloqueado) (Leer descripción) FA1065",
"price": "163,88 EUR"
},
other results ...
]
You can also use the official eBay Finding API, which has a limit of 5,000 requests per day, or a third-party API like the eBay Organic Results API from SerpApi. It's a paid API with a free plan that handles blocks and parsing on their backend.
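For the Finding API route, a minimal sketch could look like this (it assumes you have an eBay developer App ID; the operation name, item filter and the exact JSON nesting follow the Finding API docs and may need adjusting):
import requests

params = {
    "OPERATION-NAME": "findCompletedItems",   # completed listings; the filter below keeps only sold ones
    "SERVICE-VERSION": "1.13.0",
    "SECURITY-APPNAME": "YOUR-EBAY-APP-ID",   # placeholder - get one from the eBay developers program
    "RESPONSE-DATA-FORMAT": "JSON",
    "REST-PAYLOAD": "",
    "keywords": "iphone x",
    "itemFilter(0).name": "SoldItemsOnly",
    "itemFilter(0).value": "true",
}

response = requests.get("https://svcs.ebay.com/services/search/FindingService/v1", params=params, timeout=30)
search_result = response.json()["findCompletedItemsResponse"][0]["searchResult"][0]

for item in search_result.get("item", []):
    # every field in the Finding API JSON comes back wrapped in a single-element list
    print(item["title"][0], item["sellingStatus"][0]["currentPrice"][0]["__value__"])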
Example code with pagination using SerpApi:
from serpapi import EbaySearch
import json
params = {
"api_key": "...", # serpapi key, https://serpapi.com/manage-api-key
"engine": "ebay", # search engine
"ebay_domain": "ebay.es", # ebay domain
"_nkw": "iphone_x", # search query
"LH_Sold": "1", # shows sold items
"_pgn": 1 # page number
}
search = EbaySearch(params) # where data extraction happens
page_num = 0
data = []
while True:
results = search.get_dict() # JSON -> Python dict
if "error" in results:
print(results["error"])
break
for organic_result in results.get("organic_results", []):
title = organic_result.get("title")
price = organic_result.get("price")
data.append({
"title" : title,
"price" : price
})
page_num += 1
print(page_num)
if "next" in results.get("pagination", {}):
params['_pgn'] += 1
else:
break
print(json.dumps(data, indent=2))
Output:
[
{
"title": "Apple iPhone X (10) Desbloqueado 64 GB/256 GB Gris espacial/Plateado - Excelente Estado",
"price": {
"raw": "297,34 EUR",
"extracted": 297.34
}
},
{
"title": "Nuevo anuncioApple iPhone X - 64GB - Bianco (Sbloccato).",
"price": {
"raw": "340,00 EUR",
"extracted": 340.0
}
},
{
"title": "Apple iPhone X - 256GB - Factory Unlocked - Good Condition",
"price": {
"raw": "230,80 EUR",
"extracted": 230.8
}
},
other results ...
]
Hello fellow developer out there,
I'm new to Python & I need to write a web scraper to catch info from Scholar Google.
I ended up coding this function to get values using Xpath:
def clothoSpins(exp, atr=None):
    thread = browser.find_elements(By.XPATH, exp)
    xArray = []
    for t in thread:
        if not atr:
            xThread = t.text
        else:
            xThread = t.get_attribute('href')
        xArray.append(xThread)
    return xArray
I don't know if it's a good or a bad solution. So, I humbly accept any suggestions to make it work better.
Anyway, my actual problem is that I am getting all the author names from the page I am scraping, and what I really need are the names grouped by result.
When I print the results, I wish I could have something like this:
[[author1, author2, author3], [author4, author5, author6]]
What I am getting right now is:
[author1, author3, author4, author5, author6]
The structure is as follows:
<div class="gs_a">
LR Hisch,
AM Gobin
,AR Lowery,
F Tam
... -Annals of biomedical ...,2006 - Springer
</div>
And the same structure is repetead all over the page for different documents and authors.
And this is the call to the function I explained earlier:
authors = clothoSpins(".//*[@class='gs_a']//a")
Which gets me the entire list of authors.
Here is the logic (I used Selenium in the code below, but update it as per your need).
Logic:
url = "https://scholar.google.com/scholar?hl=en&as_sdt=0%2C21&q=python&btnG="
driver.get(url)
# get the authors of each result and group them per result
listBooks = []
books = driver.find_elements_by_xpath("//div[@class='gs_a']")
for bookNum in range(len(books)):
    auths = []
    # XPath positions are 1-based, hence bookNum + 1
    authors = driver.find_elements_by_xpath("(//div[@class='gs_a'])[%s]/a|(//div[@class='gs_a'])[%s]/self::*[not(a)]" % (bookNum + 1, bookNum + 1))
    for author in authors:
        auths.append(author.text)
    listBooks.append(auths)
Output:
[['F Pedregosa', 'G Varoquaux', 'A Gramfort'], ['PD Adams', 'PV Afonine'], ['TE Oliphant'], ['JW Peirce'], ['S Anders', 'PT Pyl', 'W Huber'], ['MF Sanner'], ['S Bird', 'E Klein'], ['M Lutz - 2001 - books.google.com'], ['G Rossum - 1995 - dl.acm.org'], ['W McKinney - … of the 9th Python in Science Conference, 2010 - pdfs.semanticscholar.org']]
To group by result, you can create an empty list, iterate over the results, and append the extracted data to the list as a dict; the returned result can then be serialized to a JSON string using the json.dumps() method, e.g.:
temp_list = []
for result in results:
# extracting title, link, etc.
temp_list.append({
"title": title,
# other extracted elements
})
print(json.dumps(temp_list, indent=2))
"""
Returned results is a list of dictionaries:
[
{
"title": "A new biology for a new century",
# other extracted elements..
}
]
"""
Code and full example in the online IDE:
from parsel import Selector
import requests, json, re
# https://docs.python-requests.org/en/master/user/quickstart/#passing-parameters-in-urls
params = {
"q": "biology", # search query
"hl": "en" # language
}
# https://docs.python-requests.org/en/master/user/quickstart/#custom-headers
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.87 Safari/537.36",
}
html = requests.get("https://scholar.google.com/scholar", params=params, headers=headers, timeout=30)
selector = Selector(html.text)
data = []
for result in selector.css(".gs_ri"):
# xpath("normalize-space()") to get blank text nodes as well to get the full string output
title = result.css(".gs_rt a").xpath("normalize-space()").get()
# https://regex101.com/r/7bmx8h/1
authors = re.search(r"^(.*?)-", result.css(".gs_a").xpath("normalize-space()").get()).group(1).strip()
snippet = result.css(".gs_rs").xpath("normalize-space()").get()
# https://regex101.com/r/47erNR/1
year = re.search(r"\d+", result.css(".gs_a").xpath("normalize-space()").get()).group(0)
# https://regex101.com/r/13468d/1
publisher = re.search(r"\d+\s?-\s?(.*)", result.css(".gs_a").xpath("normalize-space()").get()).group(1)
cited_by = int(re.search(r"\d+", result.css(".gs_or_btn.gs_nph+ a::text").get()).group(0))
data.append({
"title": title,
"snippet": snippet,
"authors": authors,
"year": year,
"publisher": publisher,
"cited_by": cited_by
})
print(json.dumps(data, indent=2, ensure_ascii=False))
Output:
[
{
"title": "A new biology for a new century",
"snippet": "… A society that permits biology to become an engineering discipline, that allows that science … science of biology that helps us to do this, shows the way. An engineering biology might still …",
"authors": "CR Woese",
"year": "2004",
"publisher": "Am Soc Microbiol",
"cited_by": 743
}, ... other results
{
"title": "Campbell biology",
"snippet": "… Now, Campbell series Biology texts are institutionalized. This is the standard biology text across colleges in the US To say the authors and editors know what they are doing at this point …",
"authors": "JB Reece, LA Urry, ML Cain, SA Wasserman…",
"year": "2014",
"publisher": "fvsuol4ed.org",
"cited_by": 1184
}
]
Note: in the example above, I'm using the parsel library, which is very similar to beautifulsoup and selenium in terms of data extraction.
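For comparison, a rough BeautifulSoup equivalent of the title/author extraction above (same selectors and regex; [CITATION]-type results without a link are skipped):
from bs4 import BeautifulSoup
import requests, re

params = {"q": "biology", "hl": "en"}
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.87 Safari/537.36"}

html = requests.get("https://scholar.google.com/scholar", params=params, headers=headers, timeout=30)
soup = BeautifulSoup(html.text, "lxml")

for result in soup.select(".gs_ri"):
    # .gs_rt a = title link, .gs_a = authors/venue/year line
    title_tag = result.select_one(".gs_rt a")
    if title_tag is None:
        continue
    title = title_tag.get_text(" ", strip=True)
    byline = result.select_one(".gs_a").get_text(" ", strip=True)
    authors = re.search(r"^(.*?)-", byline).group(1).strip()
    print(title, "|", authors)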
Alternatively, you can achieve the same thing by using Google Scholar Organic Results API from SerpApi. It's a paid API with a free plan.
The difference is that you don't have to create the parser from scratch, maintain it, or figure out how to scale it without getting blocked.
Example code to integrate:
from serpapi import GoogleSearch
import os, json
params = {
"api_key": os.getenv("API_KEY"), # SerpApi API key
"engine": "google_scholar", # parsing engine
"q": "biology", # search query
"hl": "en" # language
}
search = GoogleSearch(params) # where data extraction happens
results = search.get_dict() # JSON -> Python dictionary
for result in results["organic_results"]:
print(json.dumps(result, indent=2))
Output:
{
"position": 0,
"title": "A new biology for a new century",
"result_id": "KNJ0p4CbwgoJ",
"link": "https://journals.asm.org/doi/abs/10.1128/MMBR.68.2.173-186.2004",
"snippet": "\u2026 A society that permits biology to become an engineering discipline, that allows that science \u2026 science of biology that helps us to do this, shows the way. An engineering biology might still \u2026",
"publication_info": {
"summary": "CR Woese - Microbiology and molecular biology reviews, 2004 - Am Soc Microbiol"
},
"resources": [
{
"title": "nih.gov",
"file_format": "HTML",
"link": "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC419918/"
},
{
"title": "View it # CTU",
"link": "https://scholar.google.com/scholar?output=instlink&q=info:KNJ0p4CbwgoJ:scholar.google.com/&hl=en&as_sdt=0,11&scillfp=15047057806408271473&oi=lle"
}
],
"inline_links": {
"serpapi_cite_link": "https://serpapi.com/search.json?engine=google_scholar_cite&q=KNJ0p4CbwgoJ",
"html_version": "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC419918/",
"cited_by": {
"total": 743,
"link": "https://scholar.google.com/scholar?cites=775353062728716840&as_sdt=80005&sciodt=0,11&hl=en",
"cites_id": "775353062728716840",
"serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=80005&cites=775353062728716840&engine=google_scholar&hl=en"
},
"related_pages_link": "https://scholar.google.com/scholar?q=related:KNJ0p4CbwgoJ:scholar.google.com/&scioq=biology&hl=en&as_sdt=0,11",
"versions": {
"total": 20,
"link": "https://scholar.google.com/scholar?cluster=775353062728716840&hl=en&as_sdt=0,11",
"cluster_id": "775353062728716840",
"serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=0%2C11&cluster=775353062728716840&engine=google_scholar&hl=en"
}
}
}
{
"position": 9,
"title": "Campbell biology",
"result_id": "YnWp49O_RTMJ",
"type": "Book",
"link": "http://www.fvsuol4ed.org/reviews/Biology%20Organismal%20Template_Campbell%20Biology_Moran.pdf",
"snippet": "\u2026 Now, Campbell series Biology texts are institutionalized. This is the standard biology text across colleges in the US To say the authors and editors know what they are doing at this point \u2026",
"publication_info": {
"summary": "JB Reece, LA Urry, ML Cain, SA Wasserman\u2026 - 2014 - fvsuol4ed.org"
},
"resources": [
{
"title": "fvsuol4ed.org",
"file_format": "PDF",
"link": "http://www.fvsuol4ed.org/reviews/Biology%20Organismal%20Template_Campbell%20Biology_Moran.pdf"
}
],
"inline_links": {
"serpapi_cite_link": "https://serpapi.com/search.json?engine=google_scholar_cite&q=YnWp49O_RTMJ",
"cited_by": {
"total": 1184,
"link": "https://scholar.google.com/scholar?cites=3694569986105898338&as_sdt=80005&sciodt=0,11&hl=en",
"cites_id": "3694569986105898338",
"serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=80005&cites=3694569986105898338&engine=google_scholar&hl=en"
},
"related_pages_link": "https://scholar.google.com/scholar?q=related:YnWp49O_RTMJ:scholar.google.com/&scioq=biology&hl=en&as_sdt=0,11",
"versions": {
"total": 33,
"link": "https://scholar.google.com/scholar?cluster=3694569986105898338&hl=en&as_sdt=0,11",
"cluster_id": "3694569986105898338",
"serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=0%2C11&cluster=3694569986105898338&engine=google_scholar&hl=en"
},
"cached_page_link": "http://scholar.googleusercontent.com/scholar?q=cache:YnWp49O_RTMJ:scholar.google.com/+biology&hl=en&as_sdt=0,11"
}
}
If you need to parse data from all Google Scholar organic results, there's a dedicated blog post of mine at SerpApi, Scrape historic 2017-2021 Organic, Cite Google Scholar results to CSV, SQLite, that shows how to do it with the API.
Disclaimer, I work for SerpApi.
I'm looking for a way to search Google with Python and store each website in a slot in a data list. I'm looking for something like the example code below.
search=input('->')
results=google.search((search),(10))
print results
In this case I want it to search Google for whatever is in the variable "search"; 10 is the number of results I want to store in the variable, and finally I put them on the screen with "print results".
I would appreciate any help or anything similar to what I want. Thanks.
As mentioned above, Google does provide an API for completing searches (https://developers.google.com/custom-search/json-api/v1/overview), and as mentioned, depending on what you are trying to accomplish, it can get quite expensive. Another option is to scrape the Google page. Below is an example I created using Beautiful Soup (https://www.crummy.com/software/BeautifulSoup/bs4/doc/#) to scrape the Google results.
import urllib2
import xml.etree.ElementTree
from bs4 import BeautifulSoup #install using 'pip install beautifulsoup4'
'''
Since spaces will not work in url parameters, the spaces have to be converted into '+'
ex) "example text" -> "example+text"
'''
def spacesToPluses(string):
words = string.split(" ")
convertedString = ""
for i in range(0, len(words)):
convertedString += words[i] + "+"
return convertedString[0:len(convertedString)-1]
'''
Opens the url with the parameter included and reads it as a string
'''
def getRawGoogleResponse(url):
user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
headers={'User-Agent':user_agent,} #Required for google to allow url request
request=urllib2.Request(url,None,headers)
response = urllib2.urlopen(request)
rawResponse = response.read()
return rawResponse
'''
Takes in the raw string representation and converts it into an easier to navigate object (Beautiful Soup)
'''
def getParsedGoogleResponse(url):
rawResponse = getRawGoogleResponse(url)
fullPage = BeautifulSoup(rawResponse, 'html.parser')
return fullPage
'''
Finds all of the urls on a single page
'''
def getGoogleResultsOnPage(fullPage):
searchResultContainers = fullPage.find_all("h3", {"class": "r"}) #the results are contained in an h3 element with the class 'r'
pageUrls = []
for container in searchResultContainers: #get each link in the container
fullUrl = container.find('a')['href']
beginningOfUrl = fullUrl.index('http')
pageUrls.append(fullUrl[beginningOfUrl:])#Chops off the extra bits google adds to the url
return pageUrls
'''
Returns number of pages (max of 10)
'''
def getNumPages(basePage):
navTable = basePage.find("table", {"id" : "nav"}) #The nav table contains the number of pages (up to 10)
pageNumbers = navTable.find_all("a", {"class" : "fl"})
lastPageNumber = int(pageNumbers[len(pageNumbers)-2].text)
return lastPageNumber
'''
Loops through pages gathering url from each page
'''
def getAllGoogleSearchResults(search, numResults):
baseUrl = "https://www.google.com/search?q=" + spacesToPluses(search)
basePage = getParsedGoogleResponse(baseUrl)
numPages = getNumPages(basePage)
allUrls = []
for i in range(0, numPages):
completeUrl = baseUrl + "&start=" + str(i * 10) #google uses the parameter 'start' to represent the url to start at (10 urls per page)
page = getParsedGoogleResponse(completeUrl)
for url in getGoogleResultsOnPage(page):
allUrls.append(url)
return allUrls[0:numResults]#return just the number of results
def main():
print(getAllGoogleSearchResults("even another test", 1))
main()
The solution works for the first 10 pages (or next highest) of Google results. The urls are returned in an array of string objects. The information is scraped by getting the response using urllib2. Hope this helps.
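As a side note, if the official Custom Search JSON API mentioned above fits your needs, a minimal sketch would be the following (the API key and search engine ID are placeholders you create in the Google console; num is capped at 10 per request, so more results require paging with the start parameter):
import requests

API_KEY = "YOUR_API_KEY"          # created in the Google Cloud console
CX = "YOUR_SEARCH_ENGINE_ID"      # created in the Programmable Search Engine control panel

def googleApiSearch(search, numResults=10):
    params = {"key": API_KEY, "cx": CX, "q": search, "num": min(numResults, 10)}
    response = requests.get("https://www.googleapis.com/customsearch/v1", params=params)
    response.raise_for_status()
    return [item["link"] for item in response.json().get("items", [])]

print(googleApiSearch("even another test", 10))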
A Google search page returns a maximum of 10 results by default; the num parameter in the parameters dict is responsible for this:
params = {
"q": query, # query
"hl": "en", # language
"gl": "us", # country of the search, US -> USA
"start": 0, # number page by default up to 0
#"num": 100 # parameter defines the maximum number of results to return.
}
To get more data, you can paginate through all pages using an infinite while loop. Pagination is possible as long as the next button exists (determined by the presence of a button selector on the page, in our case the CSS selector .d6cvqb a[id=pnnext]). You need to increase the value of ["start"] by 10 to access the next page if it is present; otherwise, exit the while loop:
if soup.select_one('.d6cvqb a[id=pnnext]'):
params["start"] += 10
else:
break
You also need to pay attention to the fact that most sites, including Google, do not like being scraped, and the request might be blocked if you use requests, since the default user-agent in the requests library is python-requests. An additional step could be to rotate user-agents, for example, to switch between PC, mobile, and tablet, as well as between browsers, e.g. Chrome, Firefox, Safari, Edge and so on.
Check code in online IDE.
from bs4 import BeautifulSoup
import requests, json, lxml
query = input("Input your query: ") # for example: "auto"
# https://docs.python-requests.org/en/master/user/quickstart/#passing-parameters-in-urls
params = {
"q": query, # query
"hl": "en", # language
"gl": "us", # country of the search, US -> USA
"start": 0, # number page by default up to 0
#"num": 100 # parameter defines the maximum number of results to return.
}
# https://docs.python-requests.org/en/master/user/quickstart/#custom-headers
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}
page_num = 0
website_data = []
while True:
page_num += 1
print(f"page: {page_num}")
html = requests.get("https://www.google.com/search", params=params, headers=headers, timeout=30)
soup = BeautifulSoup(html.text, 'lxml')
for result in soup.select(".tF2Cxc"):
title = f'Title: {result.select_one("h3").text}'
link = f'Link: {result.select_one("a")["href"]}'
website_data.append({
"title" : title,
"link" : link,
})
if soup.select_one('.d6cvqb a[id=pnnext]'):
params["start"] += 10
else:
break
print(json.dumps(website_data, indent=2, ensure_ascii=False))
Example output:
[
{
"title": "Title: Show Your Auto",
"link": "Link: http://www.showyourauto.com/vehicles/388/2002-ford-saleen-mustang-s281-extreme-speedster"
},
{
"title": "Title: Global Competition in the Auto Parts Industry: Hearings ...",
"link": "Link: https://books.google.com/books?id=dm7bjDjkrRQC&pg=PA2&lpg=PA2&dq=auto&source=bl&ots=sIf4ELozPN&sig=ACfU3U3xea1-cJl9hiQe8cpac2KLrIF20g&hl=en&sa=X&ved=2ahUKEwjWn7ukv6P7AhU3nGoFHSRxABY4jgIQ6AF6BAgEEAM"
},
{
"title": "Title: Issues relating to the domestic auto industry: hearings ...",
"link": "Link: https://books.google.com/books?id=fHX_MJobx3EC&pg=PA79&lpg=PA79&dq=auto&source=bl&ots=jcrwR-jwck&sig=ACfU3U0p0Wn6f-RU11U8Z0GtqMjTKd44ww&hl=en&sa=X&ved=2ahUKEwjWn7ukv6P7AhU3nGoFHSRxABY4jgIQ6AF6BAgaEAM"
},
# ...
]
You can also use the Google Search Engine Results API from SerpApi. It's a paid API with a free plan.
The difference is that it will bypass blocks (including CAPTCHA) from Google; there is no need to create the parser and maintain it.
Code example:
from serpapi import GoogleSearch
from urllib.parse import urlsplit, parse_qsl
import json, os
query = input("Input your query: ") # for example: "auto"
params = {
"api_key": os.getenv("API_KEY"), # serpapi key
"engine": "google", # serpapi parser engine
"q": query, # search query
"num": "100" # number of results per page (100 per page in this case)
# other search parameters: https://serpapi.com/search-api#api-parameters
}
search = GoogleSearch(params) # where data extraction happens
organic_results_data = []
page_num = 0
while True:
results = search.get_dict() # JSON -> Python dictionary
page_num += 1
for result in results["organic_results"]:
organic_results_data.append({
"page_num": page_num,
"title": result.get("title"),
"link": result.get("link"),
"displayed_link": result.get("displayed_link"),
})
if "next_link" in results.get("serpapi_pagination", []):
search.params_dict.update(dict(parse_qsl(urlsplit(results.get("serpapi_pagination").get("next_link")).query)))
else:
break
print(json.dumps(organic_results_data, indent=2, ensure_ascii=False))
Output:
[
{
"page_num": 4,
"title": "San Francisco's JFK Drive to Remain Closed to Cars",
"link": "https://www.nbcbayarea.com/decision-2022/jfk-drive-san-francisco-election/3073470/",
"displayed_link": "https://www.nbcbayarea.com › decision-2022 › jfk-driv..."
},
{
"page_num": 4,
"title": "Self-Driving Cruise Cars Are Expanding to Most of SF, Says ...",
"link": "https://sfstandard.com/business/self-driving-cruise-cars-are-expanding-to-most-of-sf-says-ceo/",
"displayed_link": "https://sfstandard.com › business › self-driving-cruise-c..."
},
# ...
]
I'm scraping from a google search but I can only get the first row of a two row chart on the right-hand side.
The search query is:
https://www.google.com/search?q=kegerators
I've noticed that inspecting the element doesn't really help, as beautifulsoup seems to receive different HTML.
The code I have is:
htmltext=br.open(query).read()
soup=BeautifulSoup(htmltext)
search = soup.findAll("div", attrs={ "class" : "_cf" })
print search
Upon looking at the code (basically looking for "b>$" - as I know I should see 8 of those) I only get 4, which happen to be the top row of the chart.
These is the result of the search:
[<div class="_cf" style="overflow:hidden"><span class="_vf" style="height:86px;width:86px"><span class="_uf"></span><img class="_wf" src="http://t3.gstatic.com/shopping?q=tbn:ANd9GcRY5NBoY-anFlJUYExmil81vJG5i1nw6LqVu64lSjw8tSPBUEdh3JaiFix-gfSKMGtE2ZwX8w&usqp=CAc"/></span><div style="height:2.4em;overflow:hidden">EdgeStar Ultra Low Temp F...</div><div><b>$599.00</b></div><div><cite style="white-space:nowrap">Kegerator</cite></div></div>, <div class="_cf" style="overflow:hidden"><span class="_vf" style="height:86px;width:86px"><span class="_uf"></span><img class="_wf" src="http://t3.gstatic.com/shopping?q=tbn:ANd9GcRS4iCsD4EDV37Rg1kZf0nxFK3bYgYaWC-bxMv-ISg4dI8m-COU3ZHCZGs3FdJBK3npkpoE&usqp=CAc"/></span><div style="height:2.4em;overflow:hidden">Kegco K199SS‑2 D...</div><div><b>$539.99</b></div><div><cite style="white-space:nowrap">BeverageFa...</cite></div></div>, <div class="_cf" style="overflow:hidden"><span class="_vf" style="height:86px;width:86px"><span class="_uf"></span><img class="_wf" src="http://t2.gstatic.com/shopping?q=tbn:ANd9GcSkf6-jVZt34pd_6QyqZGre06VxszvFZX70-wUOEDRhEFhorX_Yek0oyr-5jvk8FNpj2KWusQ&usqp=CAc"/></span><div style="height:2.4em;overflow:hidden">EdgeStar Ultra Low Temp F...</div><div><b>$499.00</b></div><div><cite style="white-space:nowrap">Compact Ap...</cite></div></div>, <div class="_cf" style="overflow:hidden"><span class="_vf" style="height:86px;width:86px"><span class="_uf"></span><img class="_wf" src="http://t1.gstatic.com/shopping?q=tbn:ANd9GcTf56EQ6DVbOk02D7cLgVmlurU-2gNrhD6a74MnzQBWg1W290DTYQuj0sSUxQEbxo1XO6pB&usqp=CAc"/></span><div style="height:2.4em;overflow:hidden">FunTime Black Kegge...</div><div><b>$399.99</b></div><div><cite style="white-space:nowrap">Max Tool</cite></div></div>]
Is Google doing something strange here?
The reason the results might differ is that Google displays different results on each request; e.g., sometimes it returns 10 shopping results, sometimes 7 or 4.
Specifying the gl (country, e.g. us) and hl (language, e.g. en) query params could get the exact result, or close to it, that you see in your browser.
Also, don't forget to specify a user-agent, otherwise Google will eventually block your requests.
Code and example in the online IDE:
import requests
from bs4 import BeautifulSoup
headers = {
"User-Agent":
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
"(KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}
params = {
"q": "buy coffe", # intentional grammatical error to display right side shopping results
"hl": "en",
"gl": "us"
}
response = requests.get("https://www.google.com/search", headers=headers, params=params)
soup = BeautifulSoup(response.text, 'html.parser')
# scrapes both top and right side shopping results
for result in soup.select('.pla-hovercard-content-ellip'):
title = result.select_one('.pymv4e').text
link = result.select_one('.pla-hovercard-content-ellip a.tkXAec')['href']
ad_link = f"https://www.googleadservices.com/pagead{result.select_one('.pla-hovercard-content-ellip a')['href']}"
price = result.select_one('.qptdjc').text
try:
rating = result.select_one('.Fam1ne.tPhRLe')["aria-label"].replace("Rated ", "").replace(" out of ", "").replace(",", "")
except:
rating = None
try:
reviews = result.select_one('.GhQXkc').text.replace("(", "").replace(")", "")
except:
reviews = None
source = result.select_one('.zPEcBd.LnPkof').text.strip()
print(f'{title}\n{link}\n{ad_link}\n{price}\n{rating}\n{reviews}\n{source}\n')
-------------
'''
MUD\WTR | Mushroom Coffee Replacement, 90 servings
https://mudwtr.com/collections/shop/products/90-serving-bag
https://www.googleadservices.com/pagead/aclk?sa=l&ai=DChcSEwj5p8u-2rzyAhV2yJQJHfzhBoUYABAHGgJ5bQ&sig=AOD64_3NGBzLzkTv61K7kSrD2f9AREHH_g&ctype=5&q=&ved=2ahUKEwji7MK-2rzyAhWaaM0KHcnaDDcQ9aACegQIAhBo&adurl=
$125.00
4.85
1k+
mudwtr.com
...
'''
Alternatively, you can do the same thing using Google Inline Shopping API from SerpApi. It's a paid API with a free plan.
The difference is that everything is already extracted, and all that needs to be done is just to iterate over structured JSON.
Code to integrate:
import json, os
from serpapi import GoogleSearch
params = {
"api_key": os.getenv("API_KEY"),
"engine": "google",
"q": "buy coffe",
"hl": "en",
"gl": "us",
}
search = GoogleSearch(params)
results = search.get_dict()
for result in results['shopping_results']:
print(json.dumps(result, indent=2, ensure_ascii=False))
--------
'''
{
"position": 1,
"block_position": "right",
"title": "Maxwell House Original Roast | 48oz",
"price": "$10.49",
"extracted_price": 10.49,
"link": "https://www.google.com/aclk?sa=l&ai=DChcSEwiGn8aT2rzyAhXgyZQJHZHdBJMYABAEGgJ5bQ&ae=2&sig=AOD64_0jBjdUIMeqJvrXYxn4NGcpwCYrJQ&ctype=5&q=&ved=2ahUKEwiOxLmT2rzyAhWiFVkFHWMNAaEQ5bgDegQIAhBa&adurl=",
"source": "Boxed",
"rating": 4.6,
"reviews": 2000,
"thumbnail": "https://serpapi.com/searches/611e1b2cfdca3e6a1c9335e6/images/e4ae7f31164ec52021f1c04d8be4e4bda2138b1acd12c868052125eb86ead292.png"
}
...
'''
P.S - I wrote a blog post about this topic that you can find here.
Disclaimer, I work for SerpApi.
I am using mechanize to perform a bing search and then I will process the results with beautiful soup. I have successfully performed google and yahoo searches with this same method but when I do a bing search all I get is a blank page.
I am thoroughly confused why this is the case and if anyone can shed any light on the matter that would be greatly appreciated. Here is a sample of the code I'm using:
from BeautifulSoup import BeautifulSoup
import mechanize
br = mechanize.Browser()
br.set_handle_robots(False)
br.open("http://www.bing.com/search?count=100&q=cheese")
content = br.response()
content = content.read()
soup = BeautifulSoup(content, convertEntities=BeautifulSoup.ALL_ENTITIES)
print soup
The result is a blank line printed.
You probably got a response saying the answer is already in your browser cache. Try changing your query string a little, for example decrease count to 50.
You can also add some debugging code and see the headers returned by the server:
br.open("http://www.bing.com/search?count=50&q=cheese")
response = br.response()
headers = response.info()
print headers
content = response.read()
EDIT:
I have tried this query with count=100 in the Firefox and Opera browsers, and it seems that Bing does not like such a "big" count. When I decrease count, it works. So this is not a mechanize or other Python library fault; your query is problematic for Bing. It also seems that a browser can query Bing with count=100, but it must first query Bing with some smaller count. Strange!
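If you want to try that warm-up behaviour with mechanize, a sketch could look like this (reusing the same Browser instance so any cookies from the first, smaller request carry over; whether Bing actually honours this is just the observation above):
import mechanize
from BeautifulSoup import BeautifulSoup

br = mechanize.Browser()
br.set_handle_robots(False)

# warm-up request with a smaller count first
br.open("http://www.bing.com/search?count=50&q=cheese")
br.response().read()

# then retry the larger count with the same session
br.open("http://www.bing.com/search?count=100&q=cheese")
content = br.response().read()
soup = BeautifulSoup(content, convertEntities=BeautifulSoup.ALL_ENTITIES)
print soup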
Another way to achieve this is by using requests with beautifulsoup
Code and example in online IDE:
from bs4 import BeautifulSoup
import requests, lxml, json
headers = {
'User-agent':
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}
def get_organic_results():
html = requests.get('https://www.bing.com/search?q=nfs', headers=headers)
soup = BeautifulSoup(html.text, 'lxml')
bing_data = []
for result in soup.find_all('li', class_='b_algo'):
title = result.h2.text
try:
link = result.h2.a['href']
except:
link = None
displayed_link = result.find('div', class_='b_attribution').text
try:
snippet = result.find('div', class_='b_caption').p.text
except:
snippet = None
for inline in soup.find_all('div', class_='b_factrow'):
try:
inline_title = inline.a.text
except:
inline_title = None
try:
inline_link = inline.a['href']
except:
inline_link = None
bing_data.append({
'title': title,
'link': link,
'displayed_link': displayed_link,
'snippet': snippet,
'inline': [{'title': inline_title, 'link': inline_link}]
})
print(json.dumps(bing_data, indent = 2))
# part of the created json output:
'''
[
{
"title": "Need for Speed Video Games - Official EA Site",
"link": "https://www.ea.com/games/need-for-speed",
"displayed_link": "https://www.ea.com/games/need-for-speed",
"snippet": "Need for Speed Forums Buy Now All Games Forums Buy Now Learn More Buy Now Hit the gas and tear up the roads in this legendary action-driving series. Push your supercar to its limits and leave the competition in your rearview or shake off a full-scale police pursuit \u2013 it\u2019s all just a key-turn away.",
"inline": [
{
"title": null,
"link": null
}
]
}
]
'''
Alternatively, you can do the same thing using Bing Organic Results API from SerpApi. It's a paid API with a free trial of 5,000 searches.
Code to integrate:
from serpapi import GoogleSearch
import os
def get_organic_results():
params = {
"api_key": os.getenv('API_KEY'),
"engine": "bing",
"q": "nfs most wanted"
}
search = GoogleSearch(params)
results = search.get_dict()
for result in results['organic_results']:
title = result['title']
link = result['link']
displayed_link = result['displayed_link']
try:
snippet = result['snippet']
except:
snippet = None
try:
inline = result['sitelinks']['inline']
except:
inline = None
print(f'{title}\n{link}\n{displayed_link}\n{snippet}\n{inline}\n')
# part of the output:
'''
Need for Speed: Most Wanted - Car Racing Game - Official ...
https://www.ea.com/games/need-for-speed/need-for-speed-most-wanted
https://www.ea.com/games/need-for-speed/need-for-speed-most-wanted
Jun 01, 2017 · To be Most Wanted, you’ll need to outrun the cops, outdrive your friends, and outsmart your rivals. With a relentless police force gunning to take you down, you’ll need to make split-second decisions. Use the open world to …
[{'title': 'Need for Speed No Limits', 'link': 'https://www.ea.com/games/need-for-speed/need-for-speed-no-limits'}, {'title': 'Buy Now', 'link': 'https://www.ea.com/games/need-for-speed/need-for-speed-heat/buy'}, {'title': 'Need for Speed Undercover', 'link': 'https://www.ea.com/games/need-for-speed/need-for-speed-undercover'}, {'title': 'Need for Speed The Run', 'link': 'https://www.ea.com/games/need-for-speed/need-for-speed-the-run'}, {'title': 'News', 'link': 'https://www.ea.com/games/need-for-speed/need-for-speed-payback/news'}]
'''
Disclaimer, I work for SerpApi.