I'm confused. I'm trying to build a scraper for UK news in Python.
import feedparser
import pandas as pd
def poll_rss(rss_url):
    feed = feedparser.parse(rss_url)
    for entry in feed.entries:
        print("Title:", entry.title)
        print("Description:", entry.description)
        print("\n")
# Example usage:
feeds = [{"type": "news","title": "BBC", "url": "http://feeds.bbci.co.uk/news/uk/rss.xml"},
{"type": "news","title": "The Economist", "url": "https://www.economist.com/international/rss.xml"},
{"type": "news","title": "The New Statesman", "url": "https://www.newstatesman.com/feed"},
{"type": "news","title": "The New York Times", "url": "https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml"},
{"type": "news","title": "Metro UK","url": "https://metro.co.uk/feed/"},
{"type": "news", "title": "Evening Standard", "url": "https://www.standard.co.uk/rss.xml"},
{"type": "news","title": "Daily Mail", "url": "https://www.dailymail.co.uk/articles.rss"},
{"type": "news","title": "Sky News", "url": "https://news.sky.com/feeds/rss/home.xml"},
{"type": "news", "title": "The Mirror", "url": "https://www.mirror.co.uk/news/?service=rss"},
{"type": "news", "title": "The Sun", "url": "https://www.thesun.co.uk/news/feed/"},
{"type": "news", "title": "Sky News", "url": "https://news.sky.com/feeds/rss/home.xml"},
{"type": "news", "title": "The Guardian", "url": "https://www.theguardian.com/uk/rss"},
{"type": "news", "title": "The Independent", "url": "https://www.independent.co.uk/news/uk/rss"},
{"type": "news", "title": "The Telegraph", "url": "https://www.telegraph.co.uk/news/rss.xml"},
{"type": "news", "title": "The Times", "url": "https://www.thetimes.co.uk/?service=rss"},
{"type": "news", "title": "The Mirror", "url": "https://www.mirror.co.uk/news/rss.xml"}]
for feed in feeds:
    parsed_feed = feedparser.parse(feed['url'])
    print("Title:", feed['title'])
    print("Number of Articles:", len(parsed_feed.entries))
    print("\n")
    data = []
    for entry in parsed_feed.entries:
        title = entry.title
        url = entry.link
        print(entry.summary)
        if entry.summary:
            summary = entry.summary
            data.append(summary)
        else:
            entry.summary = "No summary available"
        if entry.published:
            date = entry.published
            data.append (data)
        else:
            data.append("No data available")
I then have a bit of code to sort out the saving.
df = pd.DataFrame(data)
df.columns = ['title', 'url', 'summary', 'date']
print("data" + df)
from sqlalchemy import create_engine
import mysql.connector
engine = create_engine('mysql+pymysql://root:password_thingbob@localhost/somedatabase')
df.to_sql('nationals', con = engine, if_exists = 'append', index = False)
Although the nationals table has been created and the credentials are right, why does it not save?
If the credentials are correct as you say, then the to_sql call is fine. I think the problem is the Python loop that parses the feed. In particular, the line data.append (data) creates a recursive list that cannot be constructed into a dataframe. Also, I think the data list should be a nested list where each sub-list is one entry of a parsed_feed, so that each row in the dataframe is one entry.
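To see what that line does, here is a minimal illustration in a plain Python session, independent of the feeds:
data = []
data.append(data)
print(data)  # prints [[...]] because the list now contains itself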
I would write the loop as
data = [] # <---- initialize empty list here
for feed in feeds:
    parsed_feed = feedparser.parse(feed['url'])
    print("Title:", feed['title'])
    print("Number of Articles:", len(parsed_feed.entries))
    print("\n")
    for entry in parsed_feed.entries:
        title = entry.title
        url = entry.link
        print(entry.summary)
        summary = entry.summary or "No summary available" # I simplified the ternary operators here
        date = entry.published or "No data available" # I simplified the ternary operators here
        data.append([title, url, summary, date]) # <---- append data from each entry here
df = pd.DataFrame(data, columns = ['title', 'url', 'summary', 'date'])
from sqlalchemy import create_engine
import mysql.connector
engine = create_engine('mysql+pymysql://root:password_thingbob@localhost/somedatabase')  # note: '@' separates the password from the host
df.to_sql('nationals', con = engine, if_exists = 'append', index = False)
I checked it with the feed list you provided and it works fine.
Since RSS feeds are XML files, consider pandas.read_xml and bind the data with a list comprehension, which avoids the bookkeeping of initializing a list and appending elements.
Additionally, process each feed via a user-defined method, and since you are scraping web links that can change, incorporate try...except; doing so reveals three problematic URLs in your post.
import pandas as pd
feeds = [
{"type": "news", "title": "BBC", "url": "http://feeds.bbci.co.uk/news/uk/rss.xml"},
{"type": "news", "title": "The Economist", "url": "https://www.economist.com/international/rss.xml"},
{"type": "news", "title": "The New Statesman", "url": "https://www.newstatesman.com/feed"},
{"type": "news", "title": "The New York Times", "url": "https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml"},
{"type": "news", "title": "Metro UK", "url": "https://metro.co.uk/feed/"},
{"type": "news", "title": "Evening Standard", "url": "https://www.standard.co.uk/rss"}, # FIXED URL: REMOVE .xml
{"type": "news", "title": "Daily Mail", "url": "https://www.dailymail.co.uk/articles.rss"},
{"type": "news", "title": "Sky News", "url": "https://news.sky.com/feeds/rss/home.xml"}, # PROBLEM URL
{"type": "news", "title": "The Mirror", "url": "https://www.mirror.co.uk/news/?service=rss"},
{"type": "news", "title": "The Sun", "url": "https://www.thesun.co.uk/news/feed/"},
{"type": "news", "title": "Sky News", "url": "https://news.sky.com/feeds/rss/home.xml"}, # PROBLEM URL
{"type": "news", "title": "The Guardian", "url": "https://www.theguardian.com/uk/rss"},
{"type": "news", "title": "The Independent", "url": "https://www.independent.co.uk/news/uk/rss"},
{"type": "news", "title": "The Telegraph", "url": "https://www.telegraph.co.uk/news/rss.xml"},
{"type": "news", "title": "The Times", "url": "https://www.thetimes.co.uk/?service=rss"}, # PROBLEM URL
{"type": "news", "title": "The Mirror", "url": "https://www.mirror.co.uk/news/rss.xml"}
]
hdr = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'none',
    'Accept-Language': 'en-US,en;q=0.8',
    'Connection': 'keep-alive'
}

def proc_rss(feed):
    rss_df = None
    print("Title:", feed['title'])
    try:
        # PARSE RSS XML W/ HEADERS, KEEP SPECIFIC COLUMNS, RENAME COLUMNS
        rss_df = (
            pd.read_xml(feed["url"], xpath=".//item", storage_options=hdr)
              .reindex(["title", "link", "description", "pubDate"], axis="columns")
              .set_axis(["title", "url", "summary", "date"], axis="columns")
        )
        print("Number of Articles:", rss_df.shape[0])
    except Exception as e:
        print("Number of Articles: NONE. Reason:", e)
    print("")
    return rss_df
# LIST COMPREHENSION BOUND INTO A SINGLE DATA FRAME
rss_df = pd.concat([proc_rss(f) for f in feeds], ignore_index=True)
print(rss_df)
Output
Title: BBC
Number of Articles: 34
Title: The Economist
Number of Articles: 100
Title: The New Statesman
Number of Articles: 20
Title: The New York Times
Number of Articles: 27
Title: Metro UK
Number of Articles: 30
Title: Evening Standard
Number of Articles: 100
Title: Daily Mail
Number of Articles: 153
Title: Sky News
Number of Articles: NONE. Reason: HTTP Error 404: Not Found
Title: The Mirror
Number of Articles: 25
Title: The Sun
Number of Articles: 100
Title: Sky News
Number of Articles: NONE. Reason: HTTP Error 404: Not Found
Title: The Guardian
Number of Articles: 113
Title: The Independent
Number of Articles: 100
Title: The Telegraph
Number of Articles: 100
Title: The Times
Number of Articles: NONE. Reason: xmlParseEntityRef: no name, line 1, column 1556 (<string>, line 1)
Title: The Mirror
Number of Articles: 25
title url summary date
0 Nicola Bulley: Lancashire Police find body in ... https://www.bbc.co.uk/news/uk-england-64697300... Officers searching for the missing mother-of t... Sun, 19 Feb 2023 17:54:18 GMT
1 Baftas 2023: All Quiet on the Western Front do... https://www.bbc.co.uk/news/entertainment-arts-... Netflix's World War One epic won best film and... Sun, 19 Feb 2023 23:12:05 GMT
2 Dickie Davies, host of ITV's World of Sport fo... https://www.bbc.co.uk/news/uk-england-lancashi... The presenter anchored the five-hour live TV m... Mon, 20 Feb 2023 00:47:00 GMT
3 Son Heung-min: Tottenham condemn 'utterly repr... https://www.bbc.co.uk/sport/football/64700428?... Tottenham call for social media companies to t... Sun, 19 Feb 2023 22:25:04 GMT
4 Argentina Open: British number one Cameron Nor... https://www.bbc.co.uk/sport/tennis/64700048?at... British number one Cameron Norrie misses out o... Sun, 19 Feb 2023 21:45:24 GMT
.. ... ... ... ...
922 Nicola Bulley's family 'incredibly heartbroken... https://www.mirror.co.uk/news/uk-news/breaking... Lancashire Police has recovered a body around ... Sun, 19 Feb 2023 19:51:09 +0000
923 Shamed Matt Hancock gets 'worked like a barbec... https://www.mirror.co.uk/tv/tv-news/shamed-mat... SAS: Who Dares Wins star Rudy Reyessays shamed... Sun, 19 Feb 2023 19:35:03 +0000
924 Treasure hunter uses map left by his father to... https://www.mirror.co.uk/news/world-news/treas... Jan Glazewski dug up the silver treasure burie... Sun, 19 Feb 2023 19:19:15 +0000
925 'My husband refuses to be in the delivery room... https://www.mirror.co.uk/news/weird-news/my-hu... A first-time mum-to-be says she's now feeling ... Sun, 19 Feb 2023 19:17:34 +0000
926 Nicola Bulley search diver sends message of su... https://www.mirror.co.uk/news/uk-news/nicola-b... The expert search diver called in to assist wi... Sun, 19 Feb 2023 19:16:13 +0000
[927 rows x 4 columns]
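To write the combined frame to MySQL as in your original code, the to_sql call stays the same; a sketch reusing the connection string and table name from your post (swap in your real credentials):
from sqlalchemy import create_engine

engine = create_engine('mysql+pymysql://root:password_thingbob@localhost/somedatabase')  # credentials as in your post
rss_df.to_sql('nationals', con=engine, if_exists='append', index=False)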
Related
I am trying to write a .jsonl file that needs to look like this:
{"file_name": "0001.png", "text": "This is a golden retriever playing with a ball"}
{"file_name": "0002.png", "text": "A german shepherd"}
{"file_name": "0003.png", "text": "One chihuahua"}
This is my attempt:
import json
import pandas as pd
dt = pd.read_csv('data.csv')
df = pd.DataFrame(dt)
file_name = df['image']
file_caption = df['text']
data = []
for i in range(len(file_name)):
    entry = {"file_name": file_name[i], "text": file_caption[i]}
    data.append(entry)
json_object = json.dumps(data, indent=4)
# Writing to sample.json
with open("metadata.jsonl", "w") as outfile:
    outfile.write(json_object)
But this is the output I get:
[
{
"file_name": "images/image_0.jpg",
"text": "Fattoush Salad with Roasted Potatoes"
},
{
"file_name": "images/image_1.jpg",
"text": "an analysis of self portrayal in novels by virginia woolf A room of one's own study guide contains a biography of virginia woolf, literature essays, quiz questions, major themes, characters, and a full summary and analysis about a room of one's own a room of one's own summary."
},
{
"file_name": "images/image_2.jpg",
"text": "Christmas Comes Early to U.K. Weekly Home Entertainment Chart"
},
{
"file_name": "images/image_3.jpg",
"text": "Amy Garcia Wikipedia a legacy of reform: dorothea dix (1802\u20131887) | states of"
},
{
"file_name": "images/image_4.jpg",
"text": "3D Metal Cornish Harbour Painting"
},
{
"file_name": "images/image_5.jpg",
"text": "\"In this undated photo provided by the New York City Ballet, Robert Fairchild performs in \"\"In Creases\"\" by choreographer Justin Peck which is being performed by the New York City Ballet in New York. (AP Photo/New York City Ballet, Paul Kolnik)\""
},
...
]
I know it's because I am dumping a list, so I know where I'm going wrong, but how do I create a .jsonl file in the format above?
Don't indent the generated JSON and don't append it to a list. Just write out each line to the file:
import json
import pandas as pd
df = pd.DataFrame([['0001.png', "This is a golden retriever playing with a ball"],
                   ['0002.png', "A german shepherd"],
                   ['0003.png', "One chihuahua"]], columns=['filename', 'text'])

with open("metadata.jsonl", "w") as outfile:
    for file, caption in zip(df['filename'], df['text']):
        entry = {"file_name": file, "text": caption}
        print(json.dumps(entry), file=outfile)
Output:
{"file_name": "0001.png", "text": "This is a golden retriever playing with a ball"}
{"file_name": "0002.png", "text": "A german shepherd"}
{"file_name": "0003.png", "text": "One chihuahua"}
I want to use Scrapy to extract the titles of different books at a URL and output/store them as an array of dictionaries in a JSON file.
Here is my code:
import scrapy
class BooksSpider(scrapy.Spider):
    name = "books"
    star_urls = [
        "http://books.toscrape.com"
    ]

    def parse(self, response):
        titles = response.css("article.product_pod h3 a::attr(title)").getall()
        for title in titles:
            yield {"title": title}
Here is what I put in the terminal:
scrapy crawl books -o books.json
The books.json file is created but is empty.
I checked that I was in the right directory and venv but it still doesn't work.
However:
Earlier, I deployed this spider to scrape the whole HTML document and write it to a books.html file, and everything worked.
Here is my code for this:
import scrapy
class BooksSpider(scrapy.Spider):
    name = "books"
    star_urls = [
        "http://books.toscrape.com"
    ]

    def parse(self, response):
        with open("books.html", "wb") as file:
            file.write(response.body)
and here is what I put in my terminal:
scrapy crawl books
Any ideas on what I'm doing wrong? Thanks
Edit:
Inputting response.css('article.product_pod h3 a::attr(title)').getall()
into the scrapy shell outputs:
['A Light in the Attic', 'Tipping the Velvet', 'Soumission', 'Sharp Objects', 'Sapiens: A Brief History of Humankind', 'The Requiem Red', 'The Dirty Little Secrets of Getting Your Dream Job', 'The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull', 'The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics', 'The Black Maria', 'Starving Hearts (Triangular Trade Trilogy, #1)', "Shakespeare's Sonnets", 'Set Me Free', "Scott Pilgrim's Precious Little Life (Scott Pilgrim #1)", 'Rip it Up and Start Again', 'Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991', 'Olio', 'Mesaerion: The Best Science Fiction Stories 1800-1849', 'Libertarianism for Beginners', "It's Only the Himalayas"]
Now run the code below; it should work. Note that the attribute must be named start_urls; your spider defines star_urls, so Scrapy never gets any start URLs and the output file stays empty.
import scrapy
class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = ['http://books.toscrape.com/']

    def parse(self, response):
        titles = response.css('.product_pod')
        for title in titles:
            yield {
                "title": title.css('h3 a::attr(title)').get()
                #"title": title.css('h3 a::text').get()
            }
Output:
[
{
"title": "A Light in the Attic"
},
{
"title": "Tipping the Velvet"
},
{
"title": "Soumission"
},
{
"title": "Sharp Objects"
},
{
"title": "Sapiens: A Brief History of Humankind"
},
{
"title": "The Requiem Red"
},
{
"title": "The Dirty Little Secrets of Getting Your Dream Job"
},
{
"title": "The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull"
},
{
"title": "The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics"
},
{
"title": "The Black Maria"
},
{
"title": "Starving Hearts (Triangular Trade Trilogy, #1)"
},
{
"title": "Shakespeare's Sonnets"
},
{
"title": "Set Me Free"
},
{
"title": "Scott Pilgrim's Precious Little Life (Scott Pilgrim #1)"
},
{
"title": "Rip it Up and Start Again"
},
{
"title": "Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991"
},
{
"title": "Olio"
},
{
"title": "Mesaerion: The Best Science Fiction Stories 1800-1849"
},
{
"title": "Libertarianism for Beginners"
},
{
"title": "It's Only the Himalayas"
}
]
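One more thing to watch: the spider above is named quotes, so the crawl command changes accordingly. The -o flag appends to an existing file; recent Scrapy versions also accept -O to overwrite it instead:
scrapy crawl quotes -o books.json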
How can I return or select only the parameters that are needed, as a Python dict, rather than all of the parameters that are being returned?
Here is the url we use:
https://api.nytimes.com/svc/search/v2/articlesearch.json?begin_date=20201020&facet=false&sort=newest&api-key=[YOUR_API_KEY]
Here is the response we get:
{
"status": "OK",
"copyright": "Copyright (c) 2020 The New York Times Company. All Rights Reserved.",
"response": {
"docs": [
{
"abstract": "Our latest survey shows a shift toward Biden among college-educated white voters, but surprising Trump gains among nonwhite voters.",
"web_url": "https://www.nytimes.com/2020/10/20/upshot/poll-georgia-biden-trump.html",
"snippet": "Our latest survey shows a shift toward Biden among college-educated white voters, but surprising Trump gains among nonwhite voters.",
"lead_paragraph": "A shift against President Trump among white college-educated voters in Georgia has imperiled Republicans up and down the ballot, according to a New York Times/Siena College survey on Tuesday, as Republicans find themselves deadlocked or trailing in Senate races where their party was once considered the heavy favorite.",
"source": "The New York Times",
"multimedia": [
{
"rank": 0,
"subtype": "xlarge",
"caption": null,
"credit": null,
"type": "image",
"url": "images/2020/10/20/us/undefined-promo-1603200878027/undefined-promo-1603200878027-articleLarge.jpg",
"height": 399,
"width": 600,
"legacy": {
"xlarge": "images/2020/10/20/us/undefined-promo-1603200878027/undefined-promo-1603200878027-articleLarge.jpg",
"xlargewidth": 600,
"xlargeheight": 399
},
"subType": "xlarge",
"crop_name": "articleLarge"
},
..........
How can I return only, for example, the web_url and source parameters in Python?
Please help!
This is the code I use, but it returns all parameters:
import requests
import os
from pprint import pprint
apikey = os.getenv('VGSDRL9bWiWy70GdCPA4QX8flAsemVGJ', '...')
query_url = "https://api.nytimes.com/svc/search/v2/articlesearch.json?q=trump&sort=newest&api-key=VGSDRL9bWiWy70GdCPA4QX8flAsemVGJ"
r = requests.get(query_url)
pprint(r.json())
Parse the JSON, drill into r.json()['response']['docs'], and build new dicts containing only the keys you want:
r = requests.get(query_url)
filtered = [{'web_url': d['web_url'], 'source': d['source']}
            for d in r.json()['response']['docs']]
pprint(filtered)
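As an aside, os.getenv expects the name of an environment variable rather than the key itself, and hard-coding the key in the URL exposes it. A minimal sketch of the usual pattern, assuming you export the key under a hypothetical NYT_API_KEY variable and let requests build the query string:
import os
import requests
from pprint import pprint

apikey = os.getenv('NYT_API_KEY')  # hypothetical environment variable holding your key
params = {'q': 'trump', 'sort': 'newest', 'api-key': apikey}
r = requests.get('https://api.nytimes.com/svc/search/v2/articlesearch.json', params=params)
filtered = [{'web_url': d['web_url'], 'source': d['source']}
            for d in r.json()['response']['docs']]
pprint(filtered)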
I have a response that I receive from Lobbyview in the form of JSON. I tried to put it into a data frame to access only some variables, but with no success. How can I access only some variables, such as the id and the committees, in a format exportable to .dta? Here is the code I have tried.
import requests, json
query = {"naics": "424430"}
results = requests.post('https://www.lobbyview.org/public/api/reports',
                        data=json.dumps(query))
print(results.json())
import pandas as pd
b = pd.DataFrame(results.json())
_id = data["_id"]
committee = data["_source"]["specific_issues"][0]["bills_by_algo"][0]["committees"]
An observation of the json looks like this:
"_score": 4.421936,
"_type": "object",
"_id": "5EZUMbQp3hGKH8Uq2Vxuke",
"_source":
{
"issue_codes": ["CPT"],
"received": 1214320148,
"client_name": "INTELLECTUAL PROPERTY OWNERS ASSOCIATION",
"amount": 240000,
"client":
{
"legal_name": "INTELLECTUAL PROPERTY OWNERS ASSOCIATION",
"name": "INTELLECTUAL PROPERTY OWNERS ASSOCIATION",
"naics": null,
"gvkey": null,
"ticker": "Unlisted",
"id": null,
"bvdid": "US131283992L"},
"specific_issues": [
{
"text": "H.R. 34, H.R. 1908, H.R. 2336, H.R. 3093 S. 522, S. 681, S. 1145, S. 1745",
"bills_by_algo": [
{
"titles": ["To amend title 35, United States Code, to provide for patent reform.", "Patent Reform Act of 2007", "Patent Reform Act of 2007", "Patent Reform Act of 2007"],
"top_terms": ["Commerce", "Administrative fees"],
"sponsor":
{
"firstname": "Howard",
"district": 28,
"title": "rep",
"id": 400025
},
"committees": ["House Judiciary"],
"introduced": 1176868800,
"type": "HR", "id": "110_HR1908"},
{
"titles": ["To amend title 35, United States Code, relating to the funding of the United States Patent and Trademark Office."],
"top_terms": ["Commerce", "Administrative fees"],
"sponsor":
{
"firstname": "Howard",
"district": 28,
"title": "rep",
"id": 400025
},
"committees": ["House Judiciary"],
"introduced": 1179288000,
"type": "HR",
"id": "110_HR2336"
}],
"gov_entities": ["U.S. House of Representatives", "Patent and Trademark Office (USPTO)", "U.S. Senate", "UNDETERMINED", "U.S. Trade Representative (USTR)"],
"lobbyists": ["Valente, Thomas Silvio", "Wamsley, Herbert C"],
"year": 2007,
"issue": "CPT",
"id": "S4nijtRn9Q5NACAmbqFjvZ"}],
"year": 2007,
"is_latest_amendment": true,
"type": "MID-YEAR AMENDMENT",
"id": "1466CDCD-BA3D-41CE-B7A1-F9566573611A",
"alternate_name": "INTELLECTUAL PROPERTY OWNERS ASSOCIATION"
},
"_index": "collapsed"}```
Since the data you need is nested pretty deeply in the JSON response, you have to loop through it and save the pieces to a list temporarily. To understand the response data better, I would advise using a tool such as an online JSON viewer to inspect its structure. Not every entry in the JSON contains the necessary data, so I catch the missing keys with try and except. To make sure that the id and committees are matched correctly, I add them to the list as small dicts; this list can then be read into pandas with ease. Saving to .dta requires converting the lists inside the committees column to strings; alternatively, you might want to save as .csv for a more generally usable format.
import requests, json
import pandas as pd

query = {"naics": "424430"}
results = requests.post(
    "https://www.lobbyview.org/public/api/reports", data=json.dumps(query)
)
json_response = results.json()["result"]

# to save the JSON response
# with open("data.json", "w") as outfile:
#     json.dump(results.json()["result"], outfile)

resulting_data = []

# loop through the response
for data in json_response:
    # try to find entries with specific issues, bills_by_algo and committees
    try:
        # loop through the specific issues
        for special_issue in data["specific_issues"]:
            _id = special_issue["id"]
            # loop through the bills_by_algo's
            for x in special_issue["bills_by_algo"]:
                # append the id and committees in a dict
                resulting_data.append({"id": _id, "committees": x["committees"]})
    except KeyError as e:
        print(e, "not found in entry.")
        continue

# create a DataFrame
df = pd.DataFrame(resulting_data)

# export of list objects in the column is not supported by .dta, therefore we convert
# to strings with ";" as delimiter
df["committees"] = ["; ".join(map(str, l)) for l in df["committees"]]
print(df)
df.to_stata("result.dta")
Results in
id committees
0 D8BxG5664FFb8AVc6KTphJ House Judiciary
1 D8BxG5664FFb8AVc6KTphJ Senate Judiciary
2 8XQE5wu3mU7qvVPDpUWaGP House Agriculture
3 8XQE5wu3mU7qvVPDpUWaGP Senate Agriculture, Nutrition, and Forestry
4 kzZRLAHdMK4YCUQtQAdCPY House Agriculture
.. ... ...
406 ZxXooeLGVAKec9W2i32hL5 House Agriculture
407 ZxXooeLGVAKec9W2i32hL5 Senate Agriculture, Nutrition, and Forestry; H...
408 ZxXooeLGVAKec9W2i32hL5 House Appropriations; Senate Appropriations
409 ahmmafKLfRP8wZay9o8GRf House Agriculture
410 ahmmafKLfRP8wZay9o8GRf Senate Agriculture, Nutrition, and Forestry
[411 rows x 2 columns]
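If you prefer the more generally usable CSV mentioned above, the same frame can be written with one extra line (hypothetical filename):
df.to_csv("result.csv", index=False)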
Hello fellow developers out there,
I'm new to Python and I need to write a web scraper to pull info from Google Scholar.
I ended up coding this function to get values using Xpath:
def clothoSpins(exp, atr=None):
    thread = browser.find_elements(By.XPATH, (" %s" % exp))
    xArray = []
    for t in thread:
        if not atr:
            xThread = t.text
        else:
            xThread = t.get_attribute('href')
        xArray.append(xThread)
    return xArray
I don't know if it's a good or a bad solution. So, I humbly accept any suggestions to make it work better.
Anyway, my actual problem is that I am getting all the author names from the page I am scraping, and what I really need are the names grouped by result.
When I print the results, I wish I could get something like this:
[[author1, author2,author 3],[author 4,author 5,author6]]
What I am getting right now is:
[author1,author3,author4,author5,author6]
The structure is as follows:
<div class="gs_a">
LR Hisch,
AM Gobin
,AR Lowery,
F Tam
... -Annals of biomedical ...,2006 - Springer
</div>
And the same structure is repeated all over the page for different documents and authors.
And this is the call to the function I explained earlier:
authors = (clothoSpins(".//*[@class='gs_a']//a"))
Which gets me the entire list of authors.
Here is the logic (I used Selenium in the code below, but update it as per your need).
Logic:
url = "https://scholar.google.com/scholar?hl=en&as_sdt=0%2C21&q=python&btnG="
driver.get(url)
# get the authors and add to list
listBooks = []
books = driver.find_elements_by_xpath("//div[#class='gs_a']")
for bookNum in books:
auths = []
authors = driver.find_elements_by_xpath("(//div[#class='gs_a'])[%s]/a|(//div[#class='gs_a'])[%s]/self::*[not(a)]"%(bookNum+1,bookNum+1))
for author in authors:
auths.append(author.text)
listBooks.append(auths)
Output:
[['F Pedregosa', 'G Varoquaux', 'A Gramfort'], ['PD Adams', 'PV Afonine'], ['TE Oliphant'], ['JW Peirce'], ['S Anders', 'PT Pyl', 'W Huber'], ['MF Sanner'], ['S Bird', 'E Klein'], ['M Lutz - 2001 - books.google.com'], ['G Rossum - 1995 - dl.acm.org'], ['W McKinney - … of the 9th Python in Science Conference, 2010 - pdfs.semanticscholar.org']]
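One caveat, depending on your Selenium version: the find_elements_by_xpath helpers have been removed in recent Selenium 4 releases, so on a current install use the By API, as your own snippet already does:
from selenium.webdriver.common.by import By

books = driver.find_elements(By.XPATH, "//div[@class='gs_a']")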
To group by result, you can create an empty list, iterate over the results, and append the extracted data to the list as a dict; the returned result can then be serialized to a JSON string using the json.dumps() method, e.g.:
temp_list = []

for result in results:
    # extracting title, link, etc.
    temp_list.append({
        "title": title,
        # other extracted elements
    })

print(json.dumps(temp_list, indent=2))
"""
Returned results is a list of dictionaries:
[
  {
    "title": "A new biology for a new century",
    # other extracted elements..
  }
]
"""
Code and full example in the online IDE:
from parsel import Selector
import requests, json, re

# https://docs.python-requests.org/en/master/user/quickstart/#passing-parameters-in-urls
params = {
    "q": "biology",  # search query
    "hl": "en"       # language
}

# https://docs.python-requests.org/en/master/user/quickstart/#custom-headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.87 Safari/537.36",
}

html = requests.get("https://scholar.google.com/scholar", params=params, headers=headers, timeout=30)
selector = Selector(html.text)

data = []

for result in selector.css(".gs_ri"):
    # xpath("normalize-space()") to get blank text nodes as well to get the full string output
    title = result.css(".gs_rt a").xpath("normalize-space()").get()
    # https://regex101.com/r/7bmx8h/1
    authors = re.search(r"^(.*?)-", result.css(".gs_a").xpath("normalize-space()").get()).group(1).strip()
    snippet = result.css(".gs_rs").xpath("normalize-space()").get()
    # https://regex101.com/r/47erNR/1
    year = re.search(r"\d+", result.css(".gs_a").xpath("normalize-space()").get()).group(0)
    # https://regex101.com/r/13468d/1
    publisher = re.search(r"\d+\s?-\s?(.*)", result.css(".gs_a").xpath("normalize-space()").get()).group(1)
    cited_by = int(re.search(r"\d+", result.css(".gs_or_btn.gs_nph+ a::text").get()).group(0))

    data.append({
        "title": title,
        "snippet": snippet,
        "authors": authors,
        "year": year,
        "publisher": publisher,
        "cited_by": cited_by
    })

print(json.dumps(data, indent=2, ensure_ascii=False))
Output:
[
{
"title": "A new biology for a new century",
"snippet": "… A society that permits biology to become an engineering discipline, that allows that science … science of biology that helps us to do this, shows the way. An engineering biology might still …",
"authors": "CR Woese",
"year": "2004",
"publisher": "Am Soc Microbiol",
"cited_by": 743
}, ... other results
{
"title": "Campbell biology",
"snippet": "… Now, Campbell series Biology texts are institutionalized. This is the standard biology text across colleges in the US To say the authors and editors know what they are doing at this point …",
"authors": "JB Reece, LA Urry, ML Cain, SA Wasserman…",
"year": "2014",
"publisher": "fvsuol4ed.org",
"cited_by": 1184
}
]
Note: in the example above, I'm using the parsel library, which is very similar to beautifulsoup and selenium in terms of data extraction.
Alternatively, you can achieve the same thing by using Google Scholar Organic Results API from SerpApi. It's a paid API with a free plan.
The difference is that you don't have to create the parser from scratch, maintain it, or figure out how to scale it without getting blocked.
Example code to integrate:
from serpapi import GoogleSearch
import os, json

params = {
    "api_key": os.getenv("API_KEY"),  # SerpApi API key
    "engine": "google_scholar",       # parsing engine
    "q": "biology",                   # search query
    "hl": "en"                        # language
}

search = GoogleSearch(params)  # where data extraction happens
results = search.get_dict()    # JSON -> Python dictionary

for result in results["organic_results"]:
    print(json.dumps(result, indent=2))
Output:
{
"position": 0,
"title": "A new biology for a new century",
"result_id": "KNJ0p4CbwgoJ",
"link": "https://journals.asm.org/doi/abs/10.1128/MMBR.68.2.173-186.2004",
"snippet": "\u2026 A society that permits biology to become an engineering discipline, that allows that science \u2026 science of biology that helps us to do this, shows the way. An engineering biology might still \u2026",
"publication_info": {
"summary": "CR Woese - Microbiology and molecular biology reviews, 2004 - Am Soc Microbiol"
},
"resources": [
{
"title": "nih.gov",
"file_format": "HTML",
"link": "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC419918/"
},
{
"title": "View it # CTU",
"link": "https://scholar.google.com/scholar?output=instlink&q=info:KNJ0p4CbwgoJ:scholar.google.com/&hl=en&as_sdt=0,11&scillfp=15047057806408271473&oi=lle"
}
],
"inline_links": {
"serpapi_cite_link": "https://serpapi.com/search.json?engine=google_scholar_cite&q=KNJ0p4CbwgoJ",
"html_version": "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC419918/",
"cited_by": {
"total": 743,
"link": "https://scholar.google.com/scholar?cites=775353062728716840&as_sdt=80005&sciodt=0,11&hl=en",
"cites_id": "775353062728716840",
"serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=80005&cites=775353062728716840&engine=google_scholar&hl=en"
},
"related_pages_link": "https://scholar.google.com/scholar?q=related:KNJ0p4CbwgoJ:scholar.google.com/&scioq=biology&hl=en&as_sdt=0,11",
"versions": {
"total": 20,
"link": "https://scholar.google.com/scholar?cluster=775353062728716840&hl=en&as_sdt=0,11",
"cluster_id": "775353062728716840",
"serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=0%2C11&cluster=775353062728716840&engine=google_scholar&hl=en"
}
}
}
{
"position": 9,
"title": "Campbell biology",
"result_id": "YnWp49O_RTMJ",
"type": "Book",
"link": "http://www.fvsuol4ed.org/reviews/Biology%20Organismal%20Template_Campbell%20Biology_Moran.pdf",
"snippet": "\u2026 Now, Campbell series Biology texts are institutionalized. This is the standard biology text across colleges in the US To say the authors and editors know what they are doing at this point \u2026",
"publication_info": {
"summary": "JB Reece, LA Urry, ML Cain, SA Wasserman\u2026 - 2014 - fvsuol4ed.org"
},
"resources": [
{
"title": "fvsuol4ed.org",
"file_format": "PDF",
"link": "http://www.fvsuol4ed.org/reviews/Biology%20Organismal%20Template_Campbell%20Biology_Moran.pdf"
}
],
"inline_links": {
"serpapi_cite_link": "https://serpapi.com/search.json?engine=google_scholar_cite&q=YnWp49O_RTMJ",
"cited_by": {
"total": 1184,
"link": "https://scholar.google.com/scholar?cites=3694569986105898338&as_sdt=80005&sciodt=0,11&hl=en",
"cites_id": "3694569986105898338",
"serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=80005&cites=3694569986105898338&engine=google_scholar&hl=en"
},
"related_pages_link": "https://scholar.google.com/scholar?q=related:YnWp49O_RTMJ:scholar.google.com/&scioq=biology&hl=en&as_sdt=0,11",
"versions": {
"total": 33,
"link": "https://scholar.google.com/scholar?cluster=3694569986105898338&hl=en&as_sdt=0,11",
"cluster_id": "3694569986105898338",
"serpapi_scholar_link": "https://serpapi.com/search.json?as_sdt=0%2C11&cluster=3694569986105898338&engine=google_scholar&hl=en"
},
"cached_page_link": "http://scholar.googleusercontent.com/scholar?q=cache:YnWp49O_RTMJ:scholar.google.com/+biology&hl=en&as_sdt=0,11"
}
}
If you need to parse data from all Google Scholar organic results, there's a dedicated blog post of mine at SerpApi, "Scrape historic 2017-2021 Organic, Cite Google Scholar results to CSV, SQLite", that shows how to do it with the API.
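If you just need to page through the organic results rather than the full historic pipeline, here is a minimal sketch, assuming the google_scholar engine accepts Google Scholar's start offset (10 results per page):
from serpapi import GoogleSearch
import os

params = {
    "api_key": os.getenv("API_KEY"),  # SerpApi API key
    "engine": "google_scholar",
    "q": "biology",
    "hl": "en",
    "start": 0                        # result offset, assumed to mirror Google Scholar's own start parameter
}

all_results = []
while True:
    results = GoogleSearch(params).get_dict()
    organic = results.get("organic_results")
    if not organic:
        break                         # stop when a page comes back empty
    all_results.extend(organic)
    params["start"] += 10             # move to the next page

print(len(all_results))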
Disclaimer, I work for SerpApi.