I like to retrieve information from NewsApi and ran into an issue. Enclosed the code:
from NewsApi import NewsApi
import pandas as pd
import os
import datetime as dt
from datetime import date
def CreateDF(JsonArray,columns):
dfData = pd.DataFrame()
for item in JsonArray:
itemStruct = {}
for cunColumn in columns:
itemStruct[cunColumn] = item[cunColumn]
# dfData = dfData.append(itemStruct,ignore_index=True)
# dfData = dfData.append({'id': item['id'], 'name': item['name'], 'description': item['description']},
# ignore_index=True)
# return dfData
return itemStruct
def main():
# access_token_NewsAPI.txt must contain your personal access token
with open("access_token_NewsAPI.txt", "r") as f:
myKey = f.read()[:-1]
#myKey = 'a847cee6cc254d8495632f83d5c77d39'
api = NewsApi(myKey)
# get sources of news
# columns = ['id', 'name', 'description']
# rst_source = api.GetSources()
# df = CreateDF(rst_source['sources'], columns)
# df.to_csv('source_list.csv')
#
#
# # get news for specific country
# rst_country = api.GetHeadlines()
# columns = ['author', 'publishedAt', 'title', 'description','content', 'url']
# df = CreateDF(rst_country['articles'], columns)
# df.to_csv('Headlines_country.csv')
# get news for specific symbol
symbol = "coronavirus"
sources = 'bbc.co.uk'
columns = ['author', 'publishedAt', 'title', 'description', 'content', 'source']
limit = 500 # maximum requests per day
i = 1
startDate = dt.datetime(2020, 3, 1, 8)
# startDate = dt.datetime(2020, 3, 1)
df = pd.DataFrame({'author': [], 'publishedAt': [], 'title': [], 'description': [], 'content':[], 'source': []})
while i < limit:
endDate = startDate + dt.timedelta(hours=2)
rst_symbol = api.GetEverything(symbol, 'en', startDate, endDate, sources)
rst = CreateDF(rst_symbol['articles'], columns)
df = df.append(rst, ignore_index=True)
# DF.join(df.set_index('publishedAt'), on='publishedAt')
startDate = endDate
i += 1
df.to_csv('Headlines_symbol.csv')
main()
I got following error:
rst = CreateDF(rst_symbol['articles'], columns)
KeyError: 'articles'
In this line:
rst = CreateDF(rst_symbol['articles'], columns)
I think there is some problem regarding the key not being found or defined - does anyone has an idea how to fix that? I'm thankful for every hint!
MAiniak
EDIT:
I found the solution after I tried a few of your hints. Apparently, the error occurred when the NewsAPI API key ran into a request limit. This happened every time, until I changed the limit = 500 to limit = 20. For some reason, there is no error with a new API Key and reduced limit.
Thanks for your help guys!
Probably 'articles' is not one of your columns in rst_symbol object.
The python documentation [2] [3] doesn't mention any method named NewsApi() or GetEverything(), but rather NewsApiClient() and get_everything(), i.e.:
from newsapi import NewsApiClient
# Init
newsapi = NewsApiClient(api_key='xxx')
# /v2/top-headlines
top_headlines = newsapi.get_top_headlines(q='bitcoin',
sources='bbc-news,the-verge',
category='business',
language='en',
country='us')
# /v2/everything
all_articles = newsapi.get_everything(q='bitcoin',
sources='bbc-news,the-verge',
domains='bbc.co.uk,techcrunch.com',
from_param='2017-12-01',
to='2017-12-12',
language='en',
sort_by='relevancy',
page=2)
# /v2/sources
sources = newsapi.get_sources()
Related
I'm working on a personal project and I'm trying to retrieve air quality data from the https://aqicn.org website using their API.
I've used this code, which I've copied and adapted for the city of Bucharest as follows:
import pandas as pd
import folium
import requests
# GET data from AQI website through the API
base_url = "https://api.waqi.info"
path_to_file = "~/path"
# Got token from:- https://aqicn.org/data-platform/token/#/
with open(path_to_file) as f:
contents = f.readlines()
key = contents[0]
# (lat, long)-> bottom left, (lat, lon)-> top right
latlngbox = "44.300264,25.920181,44.566991,26.297836" # For Bucharest
trail_url=f"/map/bounds/?token={key}&latlng={latlngbox}" #
my_data = pd.read_json(base_url + trail_url) # Joined parts of URL
print('columns->', my_data.columns) #2 cols ‘status’ and ‘data’ JSON
### Built a dataframe from the json file
all_rows = []
for each_row in my_data['data']:
all_rows.append([each_row['station']['name'],
each_row['lat'],
each_row['lon'],
each_row['aqi']])
df = pd.DataFrame(all_rows, columns=['station_name', 'lat', 'lon', 'aqi'])
# Cleaned the DataFrame
df['aqi'] = pd.to_numeric(df.aqi, errors='coerce') # Invalid parsing to NaN
# Remove NaN entries in col
df1 = df.dropna(subset = ['aqi'])
Unfortunately it only retrieves 4 stations whereas there are many more available on the actual site. In the API documentation the only limitation I saw was for "1,000 (one thousand) requests per second" so why can't I get more of them?
Also, I've tried to modify the lat-long values and managed to get more stations, but they were outside the city I was interested in.
Here is a view of the actual perimeter I've used in the embedded code.
If you have any suggestions as of how I can solve this issue, I'd be very happy to read your thoughts. Thank you!
Try using waqi through aqicn... not exactly a clean API but I found it to work quite well
import pandas as pd
url1 = 'https://api.waqi.info'
# Get token from:- https://aqicn.org/data-platform/token/#/
token = 'XXX'
box = '113.805332,22.148942,114.434299,22.561716' # polygon around HongKong via bboxfinder.com
url2=f'/map/bounds/?latlng={box}&token={token}'
my_data = pd.read_json(url1 + url2)
all_rows = []
for each_row in my_data['data']:
all_rows.append([each_row['station']['name'],each_row['lat'],each_row['lon'],each_row['aqi']])
df = pd.DataFrame(all_rows,columns=['station_name', 'lat', 'lon', 'aqi'])
From there its easy to plot
df['aqi'] = pd.to_numeric(df.aqi,errors='coerce')
print('with NaN->', df.shape)
df1 = df.dropna(subset = ['aqi'])
df2 = df1[['lat', 'lon', 'aqi']]
init_loc = [22.396428, 114.109497]
max_aqi = int(df1['aqi'].max())
print('max_aqi->', max_aqi)
m = folium.Map(location = init_loc, zoom_start = 5)
heat_aqi = HeatMap(df2, min_opacity = 0.1, max_val = max_aqi,
radius = 60, blur = 20, max_zoom = 2)
m.add_child(heat_aqi)
m
Or as such
centre_point = [22.396428, 114.109497]
m2 = folium.Map(location = centre_point,tiles = 'Stamen Terrain', zoom_start= 6)
for idx, row in df1.iterrows():
lat = row['lat']
lon = row['lon']
station = row['station_name'] + ' AQI=' + str(row['aqi'])
station_aqi = row['aqi']
if station_aqi > 300:
pop_color = 'red'
elif station_aqi > 200:
pop_color = 'orange'
else:
pop_color = 'green'
folium.Marker(location= [lat, lon],
popup = station,
icon = folium.Icon(color = pop_color)).add_to(m2)
m2
checking for stations within HK, returns 19
df[df['station_name'].str.contains('HongKong')]
I am using the edamam recipe api and have been trying to filter the response by only saving recipes with a number of calories > the max inputed by the user. I keep getting an error. This is the code:
import requests
import pandas as pd
def recipe_search(ingredient):
app_id = ''
app_key = ''
result = requests.get('https://api.edamam.com/search?q={}&app_id={}&app_key={}'.format(ingredient, app_id, app_key))
data = result.json()
return data['hits']
def run():
ingredient = input('Enter an ingredient: ')
max_no_of_calories = float(input('Enter the max amount of calories desired in recipe: '))
data_label = []
data_uri = []
data_calories = []
results = recipe_search(ingredient)
for result in results:
recipe = result['recipe']
result['calories'] < max_no_of_calories
data_label.append(recipe['label'])
data_uri.append(recipe['uri'])
data_calories.append(recipe['calories'])
data = {'Label': data_label,
'URL': data_uri,
'No of Calories': data_calories
}
df = pd.DataFrame(data, columns=['Label', 'URL'])
df.to_csv(r'C:\Users\name\Documents/cfg-python/export_dataframe.csv',
index=False, header=True)
run()
df2 = pd.read_csv(r'C:\Users\name\Documents/cfg-python/export_dataframe.csv')
sorted_df = df2.sort_values(by=["calories"], ascending=True)
sorted_df.to_csv(r'C:\Users\name\Documents/cfg-python/export_dataframe.csv', index=False)
This is the error:
if result['calories'] < max_no_of_calories:
KeyError: 'calories'
Is anyone able to help? How could I re-write this code with the filter of only recipes with under the max_no_of_calories? 'max_no_of_calories' is input by the user.
You made a typo in your for loop
for result in results:
recipe = result['recipe']
if recipe["calories"] < max_no_of_calories:
print(recipe["calories"])
This will get rid of your key error
I'am trying to get some information from a website with python, from a webshop.
I tried this one:
def proba():
my_url = requests.get('https://www.telekom.hu/shop/categoryresults/?N=10994&contractType=list_price&instock_products=1&Ns=sku.sortingPrice%7C0%7C%7Cproduct.displayName%7C0&No=0&Nrpp=9&paymentType=FULL')
data = my_url.json()
results = []
products = data['MainContent'][0]['contents'][0]['productList']['products']
for product in products:
name = product['productModel']['displayName']
try:
priceGross = product['priceInfo']['priceItemSale']['gross']
except:
priceGross = product['priceInfo']['priceItemToBase']['gross']
url = product['productModel']['url']
results.append([name, priceGross, url])
df = pd.DataFrame(results, columns = ['Name', 'Price', 'Url'])
# print(df) ## print df
df.to_csv(r'/usr/src/Python-2.7.13/test.csv', sep=',', encoding='utf-8-sig',index = False )
while True:
mytime=datetime.now().strftime("%H:%M:%S")
while mytime < "23:59:59":
print mytime
proba()
mytime=datetime.now().strftime("%H:%M:%S")
In this webshop there are 9 items, but i see only 1 row in the csv file.
Not entirely sure what you intend as end result. Are you wanting to update an existing file? Get data and write out all in one go? Example of latter shown below where I add each new dataframe to an overall dataframe and use a Return statement for the function call to provide each new dataframe.
import requests
from datetime import datetime
import pandas as pd
def proba():
my_url = requests.get('https://www.telekom.hu/shop/categoryresults/?N=10994&contractType=list_price&instock_products=1&Ns=sku.sortingPrice%7C0%7C%7Cproduct.displayName%7C0&No=0&Nrpp=9&paymentType=FULL')
data = my_url.json()
results = []
products = data['MainContent'][0]['contents'][0]['productList']['products']
for product in products:
name = product['productModel']['displayName']
try:
priceGross = product['priceInfo']['priceItemSale']['gross']
except:
priceGross = product['priceInfo']['priceItemToBase']['gross']
url = product['productModel']['url']
results.append([name, priceGross, url])
df = pd.DataFrame(results, columns = ['Name', 'Price', 'Url'])
return df
headers = ['Name', 'Price', 'Url']
df = pd.DataFrame(columns = headers)
while True:
mytime = datetime.now().strftime("%H:%M:%S")
while mytime < "23:59:59":
print(mytime)
dfCurrent = proba()
mytime=datetime.now().strftime("%H:%M:%S")
df = pd.concat([df, dfCurrent])
df.to_csv(r"C:\Users\User\Desktop\test.csv", encoding='utf-8')
I am trying to grab some stock related data from the web for my project.I encountered couple of problems.
Problem 1:
I tried to grab the table from this site http://sharesansar.com/c/today-share-price.html
It worked but the columns aren't grabbed in order.For eg: Column 'Company Name' has values of 'Open price'. How can I solve this?
Problem 2:
I also tried to grab a company specific data from http://merolagani.com/CompanyDetail.aspx?symbol=ADBL under 'Price History' tab.
This time I got an error while grabbing the table data.The error I got was:
self.data[key].append(cols[index].get_text())
IndexError: list index out of range
The code is as shown below:
import logging
import requests
from bs4 import BeautifulSoup
import pandas
module_logger = logging.getLogger('mainApp.dataGrabber')
class DataGrabberTable:
''' Grabs the table data from a certain url. '''
def __init__(self, url, csvfilename, columnName=[], tableclass=None):
module_logger.info("Inside 'DataGrabberTable' constructor.")
self.pgurl = url
self.tableclass = tableclass
self.csvfile = csvfilename
self.columnName = columnName
self.tableattrs = {'class':tableclass} #to be passed in find()
module_logger.info("Done.")
def run(self):
'''Call this to run the datagrabber. Returns 1 if error occurs.'''
module_logger.info("Inside 'DataGrabberTable.run()'.")
try:
self.rawpgdata = (requests.get(self.pgurl, timeout=5)).text
except Exception as e:
module_logger.warning('Error occured: {0}'.format(e))
return 1
#module_logger.info('Headers from the server:\n {0}'.format(self.rawpgdata.headers))
soup = BeautifulSoup(self.rawpgdata, 'lxml')
module_logger.info('Connected and parsed the data.')
table = soup.find('table',attrs = self.tableattrs)
rows = table.find_all('tr')[1:]
#initializing a dict in a format below
# data = {'col1' : [...], 'col2' : [...], }
#col1 and col2 are from columnName list
self.data = {}
self.data = dict(zip(self.columnName, [list() for i in range(len(self.columnName))]))
module_logger.info('Inside for loop.')
for row in rows:
cols = row.find_all('td')
index = 0
for key in self.data:
if index > len(cols): break
self.data[key].append(cols[index].get_text())
index += 1
module_logger.info('Completed the for loop.')
self.dataframe = pandas.DataFrame(self.data) #make pandas dataframe
module_logger.info('writing to file {0}'.format(self.csvfile))
self.dataframe.to_csv(self.csvfile)
module_logger.info('written to file {0}'.format(self.csvfile))
module_logger.info("Done.")
return 0
def getData(self):
""""Returns 'data' dictionary."""
return self.data
# Usage example
def main():
url = "http://sharesansar.com/c/today-share-price.html"
classname = "table"
fname = "data/sharesansardata.csv"
cols = [str(i) for i in range(18)] #make a list of columns
'''cols = [
'S.No', 'Company Name', 'Symbol', 'Open price', 'Max price',
'Min price','Closing price', 'Volume', 'Previous closing',
'Turnover','Difference',
'Diff percent', 'Range', 'Range percent', '90 days', '180 days',
'360 days', '52 weeks high', '52 weeks low']'''
d = DataGrabberTable(url, fname, cols, classname)
if d.run() is 1:
print('Data grabbing failed!')
else:
print('Data grabbing done.')
if __name__ == '__main__':
main()
Few suggestions would help.Thank you!
Your col list is missing an element there are 19 columns, not 18:
>>> len([str(i) for i in range(18)])
18
Besides you seem to over complicate things. The following should do:
import requests
from bs4 import BeautifulSoup
import pandas as pd
price_response = requests.get('http://sharesansar.com/c/today-share-price.html')
price_table = BeautifulSoup(price_response.text, 'lxml').find('table', {'class': 'table'})
price_rows = [[cell.text for cell in row.find_all(['th', 'td'])] for row in price_table.find_all('tr')]
price_df = pd.DataFrame(price_rows[1:], columns=price_rows[0])
com_df = None
for symbol in price_df['Symbol']:
comp_response = requests.get('http://merolagani.com/CompanyDetail.aspx?symbol=%s' % symbol)
comp_table = BeautifulSoup(comp_response.text, 'lxml').find('table', {'class': 'table'})
com_header, com_value = list(), list()
for tbody in comp_table.find_all('tbody'):
comp_row = tbody.find('tr')
com_header.append(comp_row.find('th').text.strip().replace('\n', ' ').replace('\r', ' '))
com_value.append(comp_row.find('td').text.strip().replace('\n', ' ').replace('\r', ' '))
df = pd.DataFrame([com_value], columns=com_header)
com_df = df if com_df is None else pd.concat([com_df, df])
print(price_df)
print(com_df)
I'm trying to download data from OECD API (https://data.oecd.org/api/sdmx-json-documentation/) into python.
I managed to download data in SDMX-JSON format (and transform it to JSON) so far:
OECD_ROOT_URL = "http://stats.oecd.org/SDMX-JSON/data"
def make_OECD_request(dsname, dimensions, params = None, root_dir = OECD_ROOT_URL):
"""Make URL for the OECD API and return a response"""
"""4 dimensions: location, subject, measure, frequency"""
if not params:
params = {}
dim_args = ['+'.join(d) for d in dimensions]
dim_str = '.'.join(dim_args)
url = root_dir + '/' + dsname + '/' + dim_str + '/all'
print('Requesting URL ' + url)
return rq.get(url = url, params = params)
response = make_OECD_request('MEI'
, [['USA', 'CZE'], [], [], ['M']]
, {'startTime': '2009-Q1', 'endTime': '2010-Q1'})
if (response.status_code == 200):
json = response.json()
How can I transform the data set into pandas.DataFrame? I tried pandas.read_json() and pandasdmx library, but I was not able to solve this.
The documentation the original question points to does not (yet?) mention that the API accepts the parameter contentType, which may be set to csv. That makes it trivial to use with Pandas.
import pandas as pd
def get_from_oecd(sdmx_query):
return pd.read_csv(
f"https://stats.oecd.org/SDMX-JSON/data/{sdmx_query}?contentType=csv"
)
print(get_from_oecd("MEI_FIN/IRLT.AUS.M/OECD").head())
Update:
The function to automatically download the data from OECD API is now available in my Python library CIF (abbreviation for the Composite Indicators Framework, installable via pip):
from cif import cif
data, subjects, measures = cif.createDataFrameFromOECD(countries = ['USA'], dsname = 'MEI', frequency = 'M')
Original answer:
If you need your data in Pandas DataFrame format, it is IMHO better to send your request to OECD with additional parameter 'dimensionAtObservation': 'AllDimensions', which results in more comprehensive JSON file.
Use following functions to download the data:
import requests as rq
import pandas as pd
import re
OECD_ROOT_URL = "http://stats.oecd.org/SDMX-JSON/data"
def make_OECD_request(dsname, dimensions, params = None, root_dir = OECD_ROOT_URL):
# Make URL for the OECD API and return a response
# 4 dimensions: location, subject, measure, frequency
# OECD API: https://data.oecd.org/api/sdmx-json-documentation/#d.en.330346
if not params:
params = {}
dim_args = ['+'.join(d) for d in dimensions]
dim_str = '.'.join(dim_args)
url = root_dir + '/' + dsname + '/' + dim_str + '/all'
print('Requesting URL ' + url)
return rq.get(url = url, params = params)
def create_DataFrame_from_OECD(country = 'CZE', subject = [], measure = [], frequency = 'M', startDate = None, endDate = None):
# Request data from OECD API and return pandas DataFrame
# country: country code (max 1)
# subject: list of subjects, empty list for all
# measure: list of measures, empty list for all
# frequency: 'M' for monthly and 'Q' for quarterly time series
# startDate: date in YYYY-MM (2000-01) or YYYY-QQ (2000-Q1) format, None for all observations
# endDate: date in YYYY-MM (2000-01) or YYYY-QQ (2000-Q1) format, None for all observations
# Data download
response = make_OECD_request('MEI'
, [[country], subject, measure, [frequency]]
, {'startTime': startDate, 'endTime': endDate, 'dimensionAtObservation': 'AllDimensions'})
# Data transformation
if (response.status_code == 200):
responseJson = response.json()
obsList = responseJson.get('dataSets')[0].get('observations')
if (len(obsList) > 0):
print('Data downloaded from %s' % response.url)
timeList = [item for item in responseJson.get('structure').get('dimensions').get('observation') if item['id'] == 'TIME_PERIOD'][0]['values']
subjectList = [item for item in responseJson.get('structure').get('dimensions').get('observation') if item['id'] == 'SUBJECT'][0]['values']
measureList = [item for item in responseJson.get('structure').get('dimensions').get('observation') if item['id'] == 'MEASURE'][0]['values']
obs = pd.DataFrame(obsList).transpose()
obs.rename(columns = {0: 'series'}, inplace = True)
obs['id'] = obs.index
obs = obs[['id', 'series']]
obs['dimensions'] = obs.apply(lambda x: re.findall('\d+', x['id']), axis = 1)
obs['subject'] = obs.apply(lambda x: subjectList[int(x['dimensions'][1])]['id'], axis = 1)
obs['measure'] = obs.apply(lambda x: measureList[int(x['dimensions'][2])]['id'], axis = 1)
obs['time'] = obs.apply(lambda x: timeList[int(x['dimensions'][4])]['id'], axis = 1)
obs['names'] = obs['subject'] + '_' + obs['measure']
data = obs.pivot_table(index = 'time', columns = ['names'], values = 'series')
return(data)
else:
print('Error: No available records, please change parameters')
else:
print('Error: %s' % response.status_code)
You can create requests like these:
data = create_DataFrame_from_OECD(country = 'CZE', subject = ['LOCOPCNO'])
data = create_DataFrame_from_OECD(country = 'USA', frequency = 'Q', startDate = '2009-Q1', endDate = '2010-Q1')
data = create_DataFrame_from_OECD(country = 'USA', frequency = 'M', startDate = '2009-01', endDate = '2010-12')
data = create_DataFrame_from_OECD(country = 'USA', frequency = 'M', subject = ['B6DBSI01'])
data = create_DataFrame_from_OECD(country = 'USA', frequency = 'Q', subject = ['B6DBSI01'])
You can recover the data from the source using code like this.
from urllib.request import urlopen
import json
URL = 'http://stats.oecd.org/SDMX-JSON/data/MEI/USA+CZE...M/all'
response = urlopen(URL).read()
responseDict = json.loads(str(response)[2:-1])
print (responseDict.keys())
print (len(responseDict['dataSets']))
Here is the output from this code.
dict_keys(['header', 'structure', 'dataSets'])
1
If you are curious about the appearance of the [2:-1] (I would be) it's because for some reason unknown to me the str function leaves some extraneous characters at the beginning and end of the string when it converts the byte array passed to it. json.loads is documented to require a string as input.
This is the code I used to get to this point.
>>> from urllib.request import urlopen
>>> import json
>>> URL = 'http://stats.oecd.org/SDMX-JSON/data/MEI/USA+CZE...M/all'
>>> response = urlopen(URL).read()
>>> len(response)
9886387
>>> response[:50]
b'{"header":{"id":"1975590b-346a-47ee-8d99-6562ccc11'
>>> str(response[:50])
'b\'{"header":{"id":"1975590b-346a-47ee-8d99-6562ccc11\''
>>> str(response[-50:])
'b\'"uri":"http://www.oecd.org/contact/","text":""}]}}\''
I understand that this is not a complete solution as you must still crack into the dataSets structure for the data to put into pandas. It's a list but you could explore it starting with this sketch.
The latest release of pandasdmx (pandasdmx.readthedocs.io) fixes previous issues accessing OECD data in sdmx-json.