What am I doing wrong in my code that I am not getting all of the data from the website's API?
For example, the entry from 2022-01-03 8:00, Vikings Exchange, is missing from my JSON.
Website: https://www.nasdaqomxnordic.com/news/companynews.
import requests
import json
import time
import csv
import pandas

start = 250
with open('C:/Users/apskaita3/Desktop/number2.txt', "r") as f:
    start = f.readlines()
    start = int(start[0])
    start = start + 70

results = {"item": {}}

# TODO: load previously saved JSON here
for i in range(0, 9800):  # <----- just change the range here to increase the number of requests
    URL = f"https://api.news.eu.nasdaq.com/news/query.action?type=handleResponse&showAttachments=true&showCnsSpecific=true&showCompany=true&countResults=false&freeText=&company=&market=Main%20Market%2C+Helsinki&cnscategory=&fromDate=&toDate=&globalGroup=exchangeNotice&globalName=NordicMainMarkets&displayLanguage=en&language=en&timeZone=CET&dateMask=yyyy-MM-dd+HH%3Amm%3Ass&limit=19&start={i}&dir=ASC"
    r = requests.get(url=URL)
    #time.sleep(1)
    # strip the JSONP-style wrapper before parsing
    res = r.text.replace("handleResponse(", "")
    data = json.loads(res)
    print("Doing: " + str(i + 1) + "th")

    downloaded_entries = data["results"]["item"]
    new_entries = [d for d in downloaded_entries if d["headline"] not in results["item"]]
    start = str(start)

    for entry in new_entries:
        if entry["market"] == 'Main Market, Helsinki' and entry["published"] >= "2021-10-20 06:30:00":
            headline = entry["headline"].strip()
            published = entry["published"]
            market = entry["market"]
            results["item"][headline] = {
                "company": entry["company"],
                "messageUrl": entry["messageUrl"],
                "published": published,
                "headline": headline,
            }
            print(f"Market: {market}\nDate: {published}\n")
            #time.sleep(5)

with open("C:/Users/apskaita3/Finansų analizės ir valdymo sprendimai, UAB/Rokas Toomsalu - Power BI analitika/Integracijos/1_Public comapnies analytics/Databasesets/Others/market_news_helsinki.json", "w") as outfile:
    json_object = json.dumps({"item": list(results["item"].values())}, indent=4)
    outfile.write(json_object)

with open("C:/Users/apskaita3/Desktop/number2.txt", "w") as outfile1:
    outfile1.write(start)
I expect to get all of the data for the Helsinki market. I have tried fetching the data, and I get a large part of it, but not all.
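Two things worth checking (hedged guesses, not verified fixes): the loop advances start by 1 while each request asks for limit=19 items, and the results are stored in a dict keyed by headline, so two notices that share a headline overwrite each other. Below is a minimal pagination sketch written under the assumption that start and limit behave like an ordinary offset and page size; it steps by the page size, keys entries by messageUrl instead of headline, and stops on a short page. The fetch_all_notices name and the trimmed query string are mine, not from the API docs:

import json
import requests

LIMIT = 19
URL_TEMPLATE = (
    "https://api.news.eu.nasdaq.com/news/query.action?type=handleResponse"
    "&market=Main%20Market%2C+Helsinki&globalGroup=exchangeNotice"
    "&globalName=NordicMainMarkets&displayLanguage=en&language=en"
    "&timeZone=CET&dateMask=yyyy-MM-dd+HH%3Amm%3Ass"
    "&limit={limit}&start={start}&dir=ASC"
)

def fetch_all_notices():
    items = {}
    start = 0
    while True:
        r = requests.get(URL_TEMPLATE.format(limit=LIMIT, start=start))
        # same wrapper-stripping as in the original script
        page = json.loads(r.text.replace("handleResponse(", ""))["results"]["item"]
        for entry in page:
            items[entry["messageUrl"]] = entry   # messageUrl should be unique per notice
        if len(page) < LIMIT:                    # short page -> no more results
            return list(items.values())
        start += LIMIT                           # advance by the page size, not by 1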
import pandas as pd
import requests
import json
import datetime
import csv

def get_pushshift_data(after, before, sub):
    url = 'https://api.pushshift.io/reddit/search/submission/?&after=' + str(after) + '&before=' + str(before) + '&subreddit=' + str(sub) + '&sort=asc&sort_type=created_utc&size=400'
    print(url)
    r = requests.get(url).json()
    # data = json.loads(r.text, strict=False)
    return r['data']

def collect_subData(subm):
    subData = list()  # list to store data points
    title = subm['title']
    url = subm['url']
    try:
        flair = subm['link_flair_text']
    except KeyError:
        flair = "NaN"
    try:
        # returns the body of the posts
        body = subm['selftext']
    except KeyError:
        body = ''
    author = subm['author']
    subId = subm['id']
    score = subm['score']
    created = datetime.datetime.fromtimestamp(subm['created_utc'])  # 1520561700.0
    numComms = subm['num_comments']
    permalink = subm['permalink']
    subData.append((subId, title, body, url, author, score, created, numComms, permalink, flair))
    subStats[subId] = subData

def update_subFile():
    upload_count = 0
    location = "subreddit_data_uncleaned/"
    print("Input filename of submission file, please add .csv")
    filename = input()
    file = location + filename
    with open(file, 'w', newline='', encoding='utf-8') as file:
        a = csv.writer(file, delimiter=',')
        headers = ["Post ID", "Title", "Body", "Url", "Author", "Score", "Publish Date", "Total No. of Comments", "Permalink", "Flair"]
        a.writerow(headers)
        for sub in subStats:
            a.writerow(subStats[sub][0])
            upload_count += 1
        print(str(upload_count) + " submissions have been uploaded into a csv file")

# global dictionary to hold 'subData'
subStats = {}
# tracks no. of submissions
subCount = 0
# Subreddit to query
sub = 'politics'
# Unix timestamp of date to crawl from.
before = int(datetime.datetime(2021, 5, 17, 0, 0).timestamp())
after = int(datetime.datetime(2014, 1, 1, 0, 0).timestamp())

data = get_pushshift_data(after, before, sub)
while len(data) > 0:
    for submission in data:
        collect_subData(submission)
        subCount += 1
    # Calls getPushshiftData() with the created date of the last submission
    print(len(data))
    print(str(datetime.datetime.fromtimestamp(data[-1]['created_utc'])))
    after = data[-1]['created_utc']
    data = get_pushshift_data(after, before, sub)

print(len(data))
update_subFile()
I call get_pushshift_data(after, before, sub) once before the while loop and there is no error. But when the same call runs again inside the loop with a different after value (type: int), the program raises JSONDecodeError: Expecting value: line 1 column 1 (char 0).
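A JSONDecodeError at "line 1 column 1" usually means the response body was empty or not JSON at all (for example an error page returned after rate limiting). A defensive variant of get_pushshift_data, offered as a sketch rather than a verified fix, checks the status code and retries with a back-off before giving up; the retries and pause parameters are my own additions:

import time
import requests

def get_pushshift_data(after, before, sub, retries=5, pause=2):
    url = ('https://api.pushshift.io/reddit/search/submission/'
           '?after=' + str(after) + '&before=' + str(before) +
           '&subreddit=' + str(sub) +
           '&sort=asc&sort_type=created_utc&size=400')
    for attempt in range(retries):
        r = requests.get(url)
        if r.status_code == 200:
            try:
                return r.json()['data']
            except (ValueError, KeyError):   # body was empty or not valid JSON
                pass
        time.sleep(pause * (attempt + 1))    # back off and retry
    return []                                # the caller's while-loop then stops cleanly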
I am having trouble parsing the market cap (a number) from an API call. I get the data, but for the purposes of the program I am developing I just need the number, not the other text around it.
import csv
import intrinio
import requests
import json
import re

api_username = "b9cb2b8cbda8dde39f27a21f66e12afd"
api_password = "6d71a6dd01dd554f92a03f0e1b40dd44"

# CSV_URL = 'https://api.intrinio.com/financials/reported.csv?identifier=AAPL&statement=income_statement&fiscal_year=2015&fiscal_period=FY'
# CSV_URL2 = 'https://api.intrinio.com/financials/standardized.csv?identifier=AAPL&statement=balance_sheet&type=FY&fiscal_period=FY&date=2017-05-20'
CSV_URL3 = 'https://api.intrinio.com/data_point?identifier=AAPL&item=marketcap'

with requests.Session() as s:
    download = s.get(CSV_URL3, auth=(api_username, api_password))
    decoded_content = download.content.decode('utf-8')
    cr = csv.reader(decoded_content.splitlines(), delimiter=',')
    my_list = list(cr)
    fx = open(r'test3.csv', 'w')
    for item in my_list:
        fx.write("%s\n" % item)

str1 = my_list[0][2]
num = re.findall(r'\d+', str1)
final_number = float(num[0])
print(final_number)
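If the endpoint can return JSON rather than CSV, parsing the number directly is simpler than regex-scraping the text. This is a hedged sketch only: it assumes the legacy /data_point endpoint returns a JSON object with a "value" field for the requested item, which you should verify against your account's API docs. It reuses the api_username and api_password defined above:

import requests

URL = 'https://api.intrinio.com/data_point?identifier=AAPL&item=marketcap'

resp = requests.get(URL, auth=(api_username, api_password))
resp.raise_for_status()                   # fail loudly on auth/HTTP errors
market_cap = float(resp.json()['value'])  # assumed field name; just the number
print(market_cap)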
I have written some Python code to help me pull data from an API. The first version of my program works quite well.
Now I am trying to develop a more DRY version of the code by introducing functions and loops. I am still new to Python.
Your professional advice will be really appreciated.
import requests
import json
# Bika Lims Authentication details
username = 'musendamea'
password = '!Am#2010#bgl;'
# API url Calls for patients, analysis and cases
patient_url = "adress1"
analysis_url = "adress2"
cases_url = "adress3"
# peform API calls and parse json data
patient_data = requests.get(patient_url, auth=(username, password ))
analysis_data = requests.get(analysis_url, auth=(username, password ))
cases_data = requests.get(cases_url, auth=(username, password ))
patients = json.loads(patient_data.text)
analysis = json.loads(analysis_data.text)
cases = json.loads(cases_data.text)
# checks for errors if any
print ("Patients")
print (patients['error'])
print (patients['success'])
print (patients['last_object_nr'])
print (patients['total_objects'])
print ("\n Analysis")
print (analysis['error'])
print (analysis['success'])
print (analysis['last_object_nr'])
print (analysis['total_objects'])
print ("\n Cases")
print (cases['error'])
print (cases['success'])
print (cases['last_object_nr'])
print (cases['total_objects'])
# create and save json files for patients, analysis and cases
with open('patients.json', 'w') as outfile:
    json.dump(patients['objects'], outfile)

with open('analysis.json', 'w') as outfile1:
    json.dump(analysis['objects'], outfile1)

with open('cases.json', 'w') as outfile2:
    json.dump(cases['objects'], outfile2)
The above code works pretty well, but my challenge is making it DRY. Somehow the loop breaks when I change the code to the following section:
your_domain = "10.0.0.191"
data_types = ['patients', 'analysis', 'cases']
checkers = ['error', 'success', 'total_objects']
urls = []
data_from_api = []

# API url Call
base_url = "http://" + your_domain + "/##API/read?"
page_size = "1000000000000000000"
patient_url = base_url + "catalog_name=bika_patient_catalog&page_size=" + page_size
analysis_url = base_url + "portal_type=AnalysisRequest&review_state=published&page_size=" + page_size
cases_url = base_url + "portal_type=Batch&page_size=" + page_size
urls.append(patient_url)
urls.append(analysis_url)
urls.append(cases_url)

# peform API calls and parse json data
def BikaApiCalls(urls, username, password):
    for i in len(urls):
        data_ = requests.get(urls[i - 1], auth=(username, password))
        print (data_types[i] + " ~ status_code: ")
        print (data_.status_code + "\n")
        data_from_api.append(json.loads(data_.text))
        for val in len(checkers):
            print (data_from_api[i][val])

BikaApiCalls(urls, username, password)

# Write JSON files
def WriteJson(data_types, data_from_api):
    for i in len(data_from_api):
        with open(data_types[i] + '.json', 'w') as outfile:
            json.dump(data_from_api[i]['objects'], outfile)

WriteJson(data_types, data_from_api)
Where am I getting it wrong? I tried some debugging but I can't seem to get through it. I'd really appreciate your help.
Thanks in advance :)
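For reference, here is a hedged sketch of how the looped version might look; it assumes the same endpoints and response shape as the working script above. Two things stand out in the posted version: for i in len(urls) raises TypeError because an int is not iterable (use range(len(urls)) or zip instead), and status_code is an int, so it cannot be concatenated to a string with +:

import json
import requests

def bika_api_calls(urls, data_types, username, password):
    data_from_api = []
    for data_type, url in zip(data_types, urls):
        response = requests.get(url, auth=(username, password))
        print("{0} ~ status_code: {1}".format(data_type, response.status_code))
        payload = response.json()
        for key in ('error', 'success', 'total_objects'):
            print(payload[key])
        data_from_api.append(payload)
    return data_from_api

def write_json(data_types, data_from_api):
    for data_type, payload in zip(data_types, data_from_api):
        with open(data_type + '.json', 'w') as outfile:
            json.dump(payload['objects'], outfile)

data_from_api = bika_api_calls(urls, data_types, username, password)
write_json(data_types, data_from_api)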
I have a script to extract data from here: http://espn.go.com/nba/statistics/player/_/stat/scoring-per-48-minutes/
Part of obtaining the data in the script looks like this:
pts_start = data.find('">',mpg_end) + 2
pts_end = data.find('<',pts_start)
store.append(data[pts_start:pts_end])
mf_start = data.find(' >',pts_end) + 2
mf_end = data.find('<',mf_start)
store.append(data[mf_start:mf_end])
fg_start = data.find(' >',mf_end) + 2
fg_end = data.find('<',fg_start)
store.append(data[fg_start:fg_end])
I see that the names like fg and pts correspond to the table headlines, but I don't understand why certain ones are abbreviated in the script.
I want to modify the script to obtain the headlines on this table: http://espn.go.com/nba/statistics/player/_/stat/rebounds. I tried doing this by just plugging in the names as they appear at the top of the table, but the resulting CSV file had missing information.
Full code:
import os
import csv
import time
import urllib2

uri = 'http://espn.go.com/nba/statistics/player/_/stat/scoring-per-48-minutes'

def get_data():
    try:
        req = urllib2.Request(uri)
        response = urllib2.urlopen(req, timeout=600)
        content = response.read()
        return content
    except Exception, e:
        print "\n[!] Error: " + str(e)
        print ''
        return False

def extract(data, rk):
    print '\n[+] Extracting data.'
    start = 0
    while True:
        store = [rk]
        if data.find('nba/player/', start) == -1:
            break
        with open("data.csv", "ab") as fcsv:
            main = data.find('nba/player/', start)
            name_start = data.find('>', main) + 1
            name_end = data.find('<', name_start)
            store.append(data[name_start:name_end])
            team_start = data.find('">', name_end) + 2
            team_end = data.find('<', team_start)
            store.append(data[team_start:team_end])
            gp_start = data.find(' >', team_end) + 2
            gp_end = data.find('<', gp_start)
            store.append(data[gp_start:gp_end])
            mpg_start = data.find(' >', gp_end) + 2
            mpg_end = data.find('<', mpg_start)
            store.append(data[mpg_start:mpg_end])
            pts_start = data.find('">', mpg_end) + 2
            pts_end = data.find('<', pts_start)
            store.append(data[pts_start:pts_end])
            mf_start = data.find(' >', pts_end) + 2
            mf_end = data.find('<', mf_start)
            store.append(data[mf_start:mf_end])
            fg_start = data.find(' >', mf_end) + 2
            fg_end = data.find('<', fg_start)
            store.append(data[fg_start:fg_end])
            m3_start = data.find(' >', fg_end) + 2
            m3_end = data.find('<', m3_start)
            store.append(data[m3_start:m3_end])
            p3_start = data.find(' >', m3_end) + 2
            p3_end = data.find('<', p3_start)
            store.append(data[p3_start:p3_end])
            ft_start = data.find(' >', p3_end) + 2
            ft_end = data.find('<', ft_start)
            store.append(data[ft_start:ft_end])
            ftp_start = data.find(' >', ft_end) + 2
            ftp_end = data.find('<', ftp_start)
            store.append(data[ftp_start:ftp_end])
            start = name_end
            rk = rk + 1
            csv.writer(fcsv).writerow(store)
        fcsv.close()

def main():
    print "\n[+] Initializing..."
    if not os.path.exists("data.csv"):
        with open("data.csv", "ab") as fcsv:
            csv.writer(fcsv).writerow(["RK", "PLAYER", "TEAM", "GP", "MPG", "PTS", "FGM-FGA", "FG%", "3PM-3PA", "3P%", "FTM-FTA", "FT%"])
        fcsv.close()
    rk = 1
    global uri
    while True:
        time.sleep(1)
        start = 0
        print "\n[+] Getting data, please wait."
        data = get_data()
        if not data:
            break
        extract(data, rk)
        print "\n[+] Preparing for next page."
        time.sleep(1.5)
        rk = rk + 40
        if rk > 300:
            print "\n[+] All Done !\n"
            break
        uri = 'http://espn.go.com/nba/statistics/player/_/stat/scoring-per-48-minutes/sort/avg48Points/count/' + str(rk)

if __name__ == '__main__':
    main()
I specifically want to know how to grab the info based on the headlines, like TEAM, GP, MPG, PTS, FGM-FGA, FG%, 3PM-3PA, 3P%, FTM-FTA, FT%,
so that the script doesn't need to be changed apart from names like pts or mpg in pts_start = data.find('">',mpg_end) + 2.
I don't understand why I can't just use the name of the headline as shown in the table for certain columns. For example, instead of FTM-FTA, the script uses ft.
Extracting HTML data is rather easy with BeautifulSoup. The following example is just to give you the idea and not a complete solution to your problem, but you can easily extend it.
from bs4 import BeautifulSoup
import urllib2

def get_html_page_dom(url):
    response = urllib2.urlopen(url)
    html_doc = response.read()
    return BeautifulSoup(html_doc, 'html5lib')

def extract_rows(dom):
    table_rows = dom.select('.mod-content tbody tr')
    for tr in table_rows:
        # skip headers
        klass = tr.get('class')
        if klass is not None and 'colhead' in klass:
            continue
        tds = tr.select('td')
        yield {'RK': tds[0].string,
               'PLAYER': tds[1].select('a')[0].string,
               'TEAM': tds[2].string,
               'GP': tds[3].string
               # you can fetch the rest of the indexes for the corresponding headers
               }

if __name__ == '__main__':
    dom = get_html_page_dom('http://espn.go.com/nba/statistics/player/_/stat/scoring-per-48-minutes/')
    for data in extract_rows(dom):
        print(data)
You can simply run and see the result ;).
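To tie this back to the question about using the on-page headlines directly, here is a possible extension (a sketch only, assuming the rows with class colhead carry the column titles exactly as shown on the page): remember the most recent header row and zip it with each data row, so every value ends up keyed by its headline (TEAM, GP, MPG, PTS, FGM-FGA, FG%, and so on). The extract_rows_by_header name is mine:

def extract_rows_by_header(dom):
    headers = None
    for tr in dom.select('.mod-content tbody tr'):
        cells = [td.get_text(strip=True) for td in tr.select('td')]
        klass = tr.get('class')
        if klass is not None and 'colhead' in klass:
            headers = cells          # remember the latest header row
            continue
        if headers:
            yield dict(zip(headers, cells))

# usage: the dict keys are whatever the page's header row contains
for row in extract_rows_by_header(dom):
    print(row)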
I'm using the following code on ScraperWiki to search Twitter for a specific hashtag.
It's working great and is picking out any postcode provided in the tweet (or returning false if none is available). This is achieved with the line data['location'] = scraperwiki.geo.extract_gb_postcode(result['text']).
But I'm only interested in tweets which include postcode information (this is because they're going to be added to a Google Map at a later stage).
What would be the easiest way to do this? I'm relatively au fait with PHP, but Python's a completely new area for me.
Thanks in advance for your help.
Best wishes,
Martin
import scraperwiki
import simplejson
import urllib2

QUERY = 'enter_hashtag_here'
RESULTS_PER_PAGE = '100'
NUM_PAGES = 10

for page in range(1, NUM_PAGES + 1):
    base_url = 'http://search.twitter.com/search.json?q=%s&rpp=%s&page=%s' \
        % (urllib2.quote(QUERY), RESULTS_PER_PAGE, page)
    try:
        results_json = simplejson.loads(scraperwiki.scrape(base_url))
        for result in results_json['results']:
            #print result
            data = {}
            data['id'] = result['id']
            data['text'] = result['text']
            data['location'] = scraperwiki.geo.extract_gb_postcode(result['text'])
            data['from_user'] = result['from_user']
            data['created_at'] = result['created_at']
            print data['from_user'], data['text']
            scraperwiki.sqlite.save(["id"], data)
    except:
        print 'Oh dear, failed to scrape %s' % base_url
        break
Do you just want this? I tried it on the free ScraperWiki test page and it seems to do what you want. If you're looking for something more complicated, let me know.
import scraperwiki
import simplejson
import urllib2

QUERY = 'meetup'
RESULTS_PER_PAGE = '100'
NUM_PAGES = 10

for page in range(1, NUM_PAGES + 1):
    base_url = 'http://search.twitter.com/search.json?q=%s&rpp=%s&page=%s' \
        % (urllib2.quote(QUERY), RESULTS_PER_PAGE, page)
    try:
        results_json = simplejson.loads(scraperwiki.scrape(base_url))
        for result in results_json['results']:
            #print result
            data = {}
            data['id'] = result['id']
            data['text'] = result['text']
            data['location'] = scraperwiki.geo.extract_gb_postcode(result['text'])
            data['from_user'] = result['from_user']
            data['created_at'] = result['created_at']
            if data['location']:
                print data['location'], data['from_user']
                scraperwiki.sqlite.save(["id"], data)
    except:
        print 'Oh dear, failed to scrape %s' % base_url
        break
Outputs:
P93JX VSDC
FV36RL Bootstrappers
Ci76fP Eli_Regalado
UN56fn JasonPalmer1971
iQ3H6zR GNOTP
Qr04eB fcnewtech
sE79dW melindaveee
ud08GT MariaPanlilio
c9B8EE akibantech
ay26th Thepinkleash
I've refined it a bit so it's a bit pickier than the ScraperWiki check for extracting GB postcodes, which lets through quite a few false positives. Basically I took the accepted answer from here and added some negative lookbehind/lookahead to filter out a few more. It looks like the ScraperWiki check does the regex without the negative lookbehind/lookahead. Hope that helps a bit.
import scraperwiki
import simplejson
import urllib2
import re

QUERY = 'sw4'
RESULTS_PER_PAGE = '100'
NUM_PAGES = 10

postcode_match = re.compile('(?<![0-9A-Z])([A-PR-UWYZ0-9][A-HK-Y0-9][AEHMNPRTVXY0-9]?[ABEHMNPRVWXY0-9]? {0,2}[0-9][ABD-HJLN-UW-Z]{2}|GIR 0AA)(?![0-9A-Z])', re.I)

for page in range(1, NUM_PAGES + 1):
    base_url = 'http://search.twitter.com/search.json?q=%s&rpp=%s&page=%s' \
        % (urllib2.quote(QUERY), RESULTS_PER_PAGE, page)
    try:
        results_json = simplejson.loads(scraperwiki.scrape(base_url))
        for result in results_json['results']:
            #print result
            data = {}
            data['id'] = result['id']
            data['text'] = result['text']
            data['location'] = scraperwiki.geo.extract_gb_postcode(result['text'])
            data['from_user'] = result['from_user']
            data['created_at'] = result['created_at']
            if data['location'] and postcode_match.search(data['text']):
                print data['location'], data['text']
                scraperwiki.sqlite.save(["id"], data)
    except:
        print 'Oh dear, failed to scrape %s' % base_url
        break
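As a quick illustration of what the lookarounds buy you (the sample strings are hypothetical): the bare pattern would also match a postcode-shaped run embedded in a longer alphanumeric string, while the negative lookbehind/lookahead reject it.

import re

postcode_match = re.compile('(?<![0-9A-Z])([A-PR-UWYZ0-9][A-HK-Y0-9][AEHMNPRTVXY0-9]?[ABEHMNPRVWXY0-9]? {0,2}[0-9][ABD-HJLN-UW-Z]{2}|GIR 0AA)(?![0-9A-Z])', re.I)

# 'SW4 7AA' is found; the postcode-shaped 'BC1 2DE' inside 'ABC1 2DEFG' is
# rejected because it sits inside a longer alphanumeric run.
for text in ['Meet at SW4 7AA tonight', 'ref ABC1 2DEFG']:
    m = postcode_match.search(text)
    print('%s -> %s' % (text, m.group(1) if m else None))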