I am having trouble parsing the market cap (it is a number) from an API call. I get the data, but for purposes of the program i am developing i just need the number and not the other stuff.
import csv
import intrinio
import requests
import json
import re
api_username = "b9cb2b8cbda8dde39f27a21f66e12afd"
api_password = "6d71a6dd01dd554f92a03f0e1b40dd44"
# CSV_URL = 'https://api.intrinio.com/financials/reported.csv?identifier=AAPL&statement=income_statement&fiscal_year=2015&fiscal_period=FY'
# CSV_URL2 = 'https://api.intrinio.com/financials/standardized.csv?identifier=AAPL&statement=balance_sheet&type=FY&fiscal_period=FY&date=2017-05-20'
CSV_URL3 = 'https://api.intrinio.com/data_point?identifier=AAPL&item=marketcap'
with requests.Session() as s:
download = s.get(CSV_URL3,auth=(api_username, api_password))
decoded_content = download.content.decode('utf-8')
cr = csv.reader(decoded_content.splitlines(), delimiter=',')
my_list = list(cr)
fx = open(r'test3.csv', 'w')
for item in my_list:
fx.write("%s\n" % item)
str1 = my_list[0][2]
num = re.findall('\d+', str1)
final_number = float(num[0])
print(final_number)
Related
I would like to ask, what I am doing wrong in code that I am not getting all data from website API?
For example, in my JSON is missing data 2022-01-03 8:00 Vikings Exchange.
Website: https://www.nasdaqomxnordic.com/news/companynews.
import requests
import json
import time
import csv
import pandas
start=250
with open('C:/Users/apskaita3/Desktop/number2.txt', "r") as f:
start= f.readlines()
start=int(start[0])
start=start + 70
results = {"item": {}}
# Todo load json
for i in range(0,9800): #<----- Just change range here to increase number of requests
URL = f"https://api.news.eu.nasdaq.com/news/query.action?type=handleResponse&showAttachments=true&showCnsSpecific=true&showCompany=true&countResults=false&freeText=&company=&market=Main%20Market%2C+Helsinki&cnscategory=&fromDate=&toDate=&globalGroup=exchangeNotice&globalName=NordicMainMarkets&displayLanguage=en&language=en&timeZone=CET&dateMask=yyyy-MM-dd+HH%3Amm%3Ass&limit=19&start={i}&dir=ASC"
r = requests.get(url = URL)
#time.sleep(1)
res = r.text.replace("handleResponse(", "")
#print(res)
#print(f'r is {r}')
res_json = json.loads(res)
#print(res_json)
data = res_json
a=i+1
#print(data)
print("Doing: " + str(i + 1) + "th")
#data = r.json()
downloaded_entries = data["results"]["item"]
new_entries = [d for d in downloaded_entries if d["headline"] not in results["item"]]
start=str(start)
for entry in new_entries:
if entry["market"] == 'Main Market, Helsinki' and entry["published"]>="2021-10-20 06:30:00":
headline = entry["headline"].strip()
published = entry["published"]
market=entry["market"]
market="Main Market, Helsinki"
results["item"][headline] = {"company": entry["company"], "messageUrl": entry["messageUrl"], "published": entry["published"], "headline": headline}
print(entry['market'])
#time.sleep(5)
print(f"Market: {market}/nDate: {published}/n")
#print( results["item"][headline] )
#print(results)
#print(json.dumps({"item": list(results["item"].values())}, indent = 4))
with open("C:/Users/apskaita3/Finansų analizės ir valdymo sprendimai, UAB/Rokas Toomsalu - Power BI analitika/Integracijos/1_Public comapnies analytics/Databasesets/Others/market_news_helsinki.json", "w") as outfile:
json_object = json.dumps({"item": list(results["item"].values())}, indent = 4)
outfile.write(json_object)
#print(json_object)
with open("C:/Users/apskaita3/Desktop/number2.txt", "w") as outfile1:
outfile1.write(start) # type: ignore
I expecting to get all data from website, Helsinki market. I have tried to fetch data, I am getting a big part of data but not all.
import pandas as pd
import requests
import json
import datetime
import csv
def get_pushshift_data(after, before, sub):
url = 'https://api.pushshift.io/reddit/search/submission/?&after=' + str(after) + '&before='+ str(before) + '&subreddit='+ str(sub) + '&sort=asc&sort_type=created_utc&size=400'
print(url)
r = requests.get(url).json()
# data = json.loads(r.text, strict=False)
return r['data']
def collect_subData(subm):
subData = list() #list to store data points
title = subm['title']
url = subm['url']
try:
flair = subm['link_flair_text']
except KeyError:
flair = "NaN"
try:
# returns the body of the posts
body = subm['selftext']
except KeyError:
body = ''
author = subm['author']
subId = subm['id']
score = subm['score']
created = datetime.datetime.fromtimestamp(subm['created_utc']) #1520561700.0
numComms = subm['num_comments']
permalink = subm['permalink']
subData.append((subId,title,body,url,author,score,created,numComms,permalink,flair))
subStats[subId] = subData
def update_subFile():
upload_count = 0
location = "subreddit_data_uncleaned/"
print("Input filename of submission file, please add .csv")
filename = input()
file = location + filename
with open(file, 'w', newline='', encoding='utf-8') as file:
a = csv.writer(file, delimiter=',')
headers = ["Post ID","Title","Body","Url","Author","Score","Publish Date","Total No. of Comments","Permalink","Flair"]
a.writerow(headers)
for sub in subStats:
a.writerow(subStats[sub][0])
upload_count+=1
print(str(upload_count) + " submissions have been uploaded into a csv file")
# global dictionary to hold 'subData'
subStats = {}
# tracks no. of submissions
subCount = 0
#Subreddit to query
sub = 'politics'
# Unix timestamp of date to crawl from.
before = int(datetime.datetime(2021,5,17,0,0).timestamp())
after = int(datetime.datetime(2014,1,1,0,0).timestamp())
data = get_pushshift_data(after, before, sub)
while len(data) > 0:
for submission in data:
collect_subData(submission)
subCount+=1
# Calls getPushshiftData() with the created date of the last submission
print(len(data))
print(str(datetime.datetime.fromtimestamp(data[-1]['created_utc'])))
after = data[-1]['created_utc']
data = get_pushshift_data(after, before, sub)
print(len(data))
update_subFile()
At line 1: I call the get_pushshift_data(after, before, sub) function to scrape the data and there is no error. But then when I want to the same thing again at line 11 but with different time for after variable(type: int), the program comes out the error of JSONDecodeError: Expecting value: line 1 column 1 (char 0).
This is the image for you to refer to which I have just described above
This is the Error Image
Since I am going to create a number of dataframes I know won't fit inside just a single google worksheet (because of the limitation of columns) I want to split the data into multiple worksheets. I'm using set_with_dataframe() and defining which worksheet the dataframes is going to get imported to, so my first thought was to create and define several worksheets and then use the same method - the problem is just that I don't know how to "split" the data when there's no more columns in the first worksheet (and then the second, and the third and so on...)
I'm quite new at working with Python and I have been stuck with this for days so any kind of help would be appreciated.
My code looks like this:
import gspread
from gspread_dataframe import get_as_dataframe, set_with_dataframe
from google.oauth2 import service_account
from google.auth.transport.requests import AuthorizedSession
from bs4 import BeautifulSoup
import pandas as pd
import requests
import traceback
import os
class DataScraper():
def __init__(self, sheets):
self.data_worksheet = sheets.data_worksheet
self.total_urls = 0
self.urls = self.getAllUrls(sheets.url_worksheet)
def getAllUrls(self, urlWorkSheet):
urls = urlWorkSheet.get_all_values()
finalUrls = []
for r in urls:
# Get all urls
modifiedUrls = [d for d in r[:14] if "https://" in d]
if len(modifiedUrls) != 0:
self.total_urls += len(modifiedUrls)
finalUrls.append(modifiedUrls)
return finalUrls
def StartScrape(self):
current_column_count = 1
last_data_frame_max_width = 0
current_element = 0
for urlRow in self.urls:
current_row_count = 1
for url in urlRow:
current_element += 1
error = False
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
try:
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
labels = []
results = []
tbl = soup.find('table')
for tr in tbl.findAll('tr'):
headers = [th.text.strip() for th in tr.findAll('th')]
data = [td.text.strip() for td in tr.findAll('td')]
labels.append(headers)
results.append(data)
final_results = []
for final_labels, final_data in zip(labels, results):
final_results.append({'Labels': final_labels, 'Data': final_data})
df = pd.DataFrame(final_results)
df['Labels'] = df['Labels'].str[0]
df['Data'] = df['Data'].str[0]
indexNames = df[df['Labels'] == 'Links'].index
df.drop(indexNames , inplace=True)
set_with_dataframe(self.data_worksheet, df, col=current_column_count, row=current_row_count, include_column_header=False)
current_row_count += df.shape[0]+2
if df.shape[1] > last_data_frame_max_width:
last_data_frame_max_width = df.shape[1]
except Exception:
error = True
finally:
print(f"Processed page {current_element}/{self.total_urls} with status: {'success' if not error else 'error'}")
current_column_count += last_data_frame_max_width+5
last_data_frame_max_width = 0
class Sheets():
def __init__(self, filename, key):
self.filename = filename
self.key = key
self.data_worksheet = None
self.url_worksheet = None
self.getSheets(self.getCredentials())
def getCredentials(self):
# sep = seperator
_ = os.path.normpath(__file__).split(os.sep)
_.insert(1, "/")
credentials = service_account.Credentials.from_service_account_file(os.path.join(os.path.join(*_[0:-1]), self.filename))
return credentials.with_scopes( ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive'])
def getSheets(self, scoped_credentials):
gc = gspread.Client(auth=scoped_credentials)
gc.session = AuthorizedSession(scoped_credentials)
spreadsheet_key = gc.open_by_key(self.key)
# Get sheet with data import
self.data_worksheet = spreadsheet_key.worksheet("Data")
# Get list with url's
self.url_worksheet = url_worksheet = spreadsheet_key.worksheet("Felix Copy")
# Get sheets
sheets = Sheets("credentials.json", "key_id")
# Start scraping
scraper = DataScraper(sheets)
scraper.StartScrape()
I've basically created a spider that follows a set of links acquired from an API, and then extracts text from the HTML body. I'm trying to append returned items to appropriate lists, which are then added to a dictionary. When I run the code, the resultant JSON file only successfully writes the first line.
I am running Python 3.6 in a virtual environment on a Windows 10 64-bit machine, and I run pip-upgrade daily.
from nltk.corpus import stopwords
import smtplib
from time import sleep # To prevent overwhelming the server between connections
from bs4 import BeautifulSoup as soup
import scrapy
import mysql.connector as mariadb
import sys
from collections import Counter
from pprint import pprint
import json
import re
conn = mariadb.connect(user=dbuser, password=dbpassword, database=dbdatabase)
c = conn.cursor()
e = sys.exc_info()[0]
c.execute("Select URL FROM [TABLE]")
JobURLs = c.fetchall()
for object in JobURLs:
urls = []
url_string = str(object)
rx = re.compile('\W\W\W$')
res = rx.sub('', url_string)
rx = re.compile('^\W\W')
url = rx.sub('', res)
urls.append(url)
c.execute("Select JvId FROM [TABLE]")
JobIDs = c.fetchall()
for object in JobIDs:
item = {}
item['JvId'] = []
JobID_string = str(object)
rx = re.compile('\W\W\W$')
res = rx.sub('', JobID_string)
rx = re.compile('^\W\W')
JobID = rx.sub('', res)
item['JvId'].append(JobID)
class JobListing(scrapy.Spider):
name = 'JobListingCrawler'
start_urls = urls
def parse(self, response):
# pass
item['urlText'] = response.url
page_html = response.body
page_soup = soup(page_html, 'lxml')
for script in page_soup(['script', 'style']):
script.extract()
item['jobDescText'] = page_soup.get_text('''\n''', strip=True)
## TextCleaner Function for Word Counter
text = item['jobDescText'].replace('\n', ' ')
lines = [line.strip() for line in text.splitlines()]
chunks = [phrase.strip() for line in lines for phrase in line.split(' ')]
def chunk_space(chunk):
chunk_out = chunk + ' '
return chunk_out
text = ''.join(chunk_space(chunk) for chunk in chunks if chunk).encode('utf-8')
try:
text = text.decode('unicode_escape').encode('ascii', 'ignore')
except:
print(e)
pass
text = re.sub('[^a-zA-Z,+3]', ' ', str(text))
text = text.lower().split()
stop_words = set(stopwords.words('english'))
text = [word for word in text if not word in stop_words]
wordCounter = Counter(text)
item['wordCounter'] = str(wordCounter)
## And now we parse for email addresses!
prog = re.compile(r"[A-z0-9._%+-]+#[A-z0-9.-]+\.[A-z]{2,}")
found = prog.search(item['jobDescText'].replace('\n', ' '))
try:
item['email'] = str(found.group(0))
except:
item['email'] = 'null'
pass
filename = 'results.jl'
line = json.dumps(dict(item)) + '\n'
with open(filename, 'a') as f:
f.write(line)
self.log('Saved Line to %s' % filename)
You just need to declare a Scrapy Item, which contains yours returned fields definion.
After that, just need to config your setting file to allow Scrapy Feed Exports using the built-in JsonItemExporter for your extract data:
FEED_URI: file:///tmp/export.json
FEED_FORMAT: json
So silly me: I put the list variable within the For Loop, so each time the actions looped it would delete the previously written values. Moving them outside of the loop solved the problem.
c.execute("Select URL FROM CareerOneStopJobs")
JobURLs = c.fetchall()
urls = []
for element in JobURLs:
url_string = str(element)
rx = re.compile('\W\W\W$')
res = rx.sub('', url_string)
rx = re.compile('^\W\W')
url = rx.sub('', res)
urls.append(url)
c.execute("Select JvId FROM CareerOneStopJobs")
JobIDs = c.fetchall()
item = {}
for JobID in JobIDs:
item['JvId'] = []
JobID_string = str(JobID)
rx = re.compile('\W\W\W$')
res = rx.sub('', JobID_string)
rx = re.compile('^\W\W')
JobID = rx.sub('', res)
item['JvId'] = JobID
I'm getting a list index out of range error, and not sure why. My code is a webscraper to collect temperature data from a website. All worked fine for months, until recently.
I have a number of functions shown below as reference. The important one is getDailyAve(), which is where I'm getting the exception thrown.
Any thoughts or advice is appreciated.
import sys
import urllib
from bs4 import BeautifulSoup
from urllib2 import urlopen, URLError
import webbrowser
import time
from collections import Counter
import numpy as np
import re
import csv
import datetime
from datetime import timedelta
DATE_FORMAT = '%Y/%m/%d'
def daterange(start, end):
def convert(date):
try:
date = datetime.datetime.strptime(date, DATE_FORMAT)
return date.date()
except TypeError:
return date
def get_date(n):
return datetime.datetime.strftime(convert(start) + timedelta(days=n), DATE_FORMAT)
days = (convert(end) - convert(start)).days
if days <= 0:
raise ValueError('The start date must be before the end date.')
for n in range(0, days):
yield get_date(n)
class SiteLocation:
"""class defining mine location parameters to lookup on weather search"""
def __init__(self, city, state, zip, code):
self.city = city
self.state = state
self.zip = zip
self.code = code
def getDailyAve(url):
url = urllib.urlopen(url)
soup = BeautifulSoup(url.read(), 'lxml')
form = soup.find("form",{"id": "archivedate"})
table = form.find_next_sibling("table")
rows = table.select("tr")[1:]
time=[]
temp=[]
minutes=[]
# handle no data case
if soup.find(text="Archive data not available for this date."):
print("Data not available, URL: '%s'" % url)
return None
# capture time and temps
for row in rows:
data = [td.text for td in row.find_all("td")]
match = re.search(r"[+-]?(?<!\.)\b[0-9]+\b(?!\.[0-9])",data[2])
if match:
temp.append(match.group())
time.append(data[0])
minutes.append(data[0][-4:-2])
common = Counter(minutes).most_common()[0][0]
finalTimes = []
finalTemps = []
for i in range(0,len(time)):
if minutes[i] == common:
finalTimes.append(time[i])
finalTemps.append(int(temp[i]))
dailyAve = sum(finalTemps) / float(len(finalTimes))
return dailyAve
def writeToCsv(list1, list2, list3, list4, list5, list6, list7, list8):
with open('results.csv', 'wb') as csvfile:
results = csv.writer(csvfile, delimiter=',')
results.writerow(['T-SJ', 'T- RB', 'T-DS', 'T-JW', 'T-GB', 'D', 'M', 'Y'])
for idx in range(0,len(list1)):
results.writerow([str(list1[idx]), str(list2[idx]), str(list3[idx]), str(list4[idx]), str(list5[idx]), str(list6[idx]), str(list7[idx]), str(list8[idx])])
def buildURL(location, day, month, year):
if day < 10:
strDay = '0'+str(day)
else:
strDay = str(day)
baseURL = "http://www.weatherforyou.com/reports/index.php?forecast=pass&pass=archive&zipcode=" + location.zip + "&pands=" + location.city + "%2" + "C" + location.state + "&place=" + location.city + "&state=" + location.state + "&icao=" + location.code + "&country=us&month=" + str(month) + "&day=" + strDay + "&year=" + str(year) + "&dosubmit=Go"
return baseURL
def main():
loc1 = SiteLocation('Farmington','NM','87401','KFMN')
loc2 = SiteLocation('Whitesville','WV','25209','KBKW')
loc3 = SiteLocation('Rangely','CO','81648','KVEL')
loc4 = SiteLocation('Brookwood','AL','35444','KTCL')
loc5 = SiteLocation('Princeton','IN','47670','KAJG')
start = '2016/08/31'
end = datetime.date.today()
dateRange = list(daterange(start, end))
listDailyAve1 = []
listDailyAve2 = []
listDailyAve3 = []
listDailyAve4 = []
listDailyAve5 = []
listDays = []
listMonths = []
listYears = []
for idx in range(0,len(dateRange)):
strDate = str(dateRange[idx]).split("/")
year = strDate[0]
month = strDate[1]
day = strDate[2]
url1 = buildURL(loc1, day, month, year)
url2 = buildURL(loc2, day, month, year)
url3 = buildURL(loc3, day, month, year)
url4 = buildURL(loc4, day, month, year)
url5 = buildURL(loc5, day, month, year)
dailyAve1 = getDailyAve(url1)
dailyAve2 = getDailyAve(url2)
dailyAve3 = getDailyAve(url3)
dailyAve4 = getDailyAve(url4)
dailyAve5 = getDailyAve(url5)
listDailyAve1.append(dailyAve1)
listDailyAve2.append(dailyAve2)
listDailyAve3.append(dailyAve3)
listDailyAve4.append(dailyAve4)
listDailyAve5.append(dailyAve5)
listDays.append(day)
listMonths.append(month)
listYears.append(year)
writeToCsv(listDailyAve1, listDailyAve2, listDailyAve3, listDailyAve4,listDailyAve5, listDays, listMonths, listYears)
if __name__ == '__main__':
status = main()
sys.exit(status)
Here is the exception thrown:
Traceback (most recent call last):
File ".\weatherScrape2.py", line 147, in <module>
status = main()
File ".\weatherScrape2.py", line 128, in main
dailyAve1 = getDailyAve(url1)
File ".\weatherScrape2.py", line 61, in getDailyAve
match = re.search(r"[+-]?(?<!\.)\b[0-9]+\b(?!\.[0-9])",data[2])
IndexError: list index out of range
First of all, you need to handle situations when there is no available data. Here is one way:
# handle "no data" case
if soup.find(text="Archive data not available for this date."):
print("Data not available, URL: '%s'." % url)
return None
Also, I think there is a problem in the logic of getting the rows. I'd do it this way:
form = soup.find("form", {"id": "archivedate"})
table = form.find_next_sibling("table")
rows = table.select("tr")[1:]
Here is a complete snippet that I'm executing (for a single URL):
import requests
from bs4 import BeautifulSoup
from collections import Counter
import re
def getDailyAve(url):
response = requests.get(url)
soup = BeautifulSoup(response.content, 'lxml')
form = soup.find("form", {"id": "archivedate"})
table = form.find_next_sibling("table")
rows = table.select("tr")[1:]
time = []
temp = []
minutes = []
# handle no data case
if soup.find(text="Archive data not available for this date."):
print("Data not available, URL: '%s'" % url)
return None
# capture time and temps
for row in rows:
data = [td.text for td in row.find_all("td")]
match = re.search(r"[+-]?(?<!\.)\b[0-9]+\b(?!\.[0-9])", data[2])
if match:
temp.append(match.group())
time.append(data[0])
minutes.append(data[0][-4:-2])
common = Counter(minutes).most_common()[0][0]
finalTimes = []
finalTemps = []
for i in range(0, len(time)):
if minutes[i] == common:
finalTimes.append(time[i])
finalTemps.append(int(temp[i]))
dailyAve = sum(finalTemps) / float(len(finalTimes))
return dailyAve
print(getDailyAve("https://www.weatherforyou.com/reports/index.php?forecast=pass&pass=archive&zipcode=87401&pands=Farmington%2CNM&place=Farmington&state=NM&icao=KFMN&country=us&month=09&day=03&year=2016&dosubmit=Go"))