Filling missing dates in Python with Beautiful Soup and pandas

I have a website from which I scraped data into a CSV file. I was able to scrape the date and the price; however, the date is in week format and I need to convert it into daily prices for the 5 working days (Mon-Sat). I used Python with pandas and Beautiful Soup for this.
from urllib.request import urlopen
from urllib.error import HTTPError, URLError
from bs4 import BeautifulSoup
import pandas as pd

try:
    html = urlopen("https://www.eia.gov/dnav/ng/hist/rngwhhdD.htm")
except HTTPError as e:
    print(e)
except URLError:
    print("Server down or incorrect domain")
else:
    res = BeautifulSoup(html.read(), "html5lib")
    price = res.findAll(class_=["tbody", "td", "B3"])
    price_list = []
    for tag in price:
        price_tag = tag.getText()
        price_list.append(price_tag)
        print(price_tag)
    date = res.findAll(class_=["tbody", "td", "B6"])
    date_list = []
    for tag in date:
        date_tag = tag.getText()
        date_list.append(date_tag)
        print(date_tag)
    d1 = pd.DataFrame({'Date': date_list})
    d2 = pd.DataFrame({'Price': price_list})
    df = pd.concat([d1, d2], axis=1)
    print(df)
    df.to_csv("Gas Price.csv", index=False, header=True)

Your current code builds one flat list of prices and one of dates, and the two don't line up.
The following script finds the data table (it is the only one with a summary attribute) and loops over each row (tr). It takes the part before " to " from the Week column (a td with class B6) and converts it to a datetime.
For each price cell (a td with class B3) it reads the price (or an empty string), records the current date, and then advances the date by one day.
from urllib.request import urlopen
from urllib.error import HTTPError, URLError
from bs4 import BeautifulSoup
import pandas as pd
import datetime

try:
    html = urlopen("https://www.eia.gov/dnav/ng/hist/rngwhhdD.htm")
except HTTPError as e:
    print(e)
except URLError:
    print("Server down or incorrect domain")
else:
    res = BeautifulSoup(html.read(), "html5lib")
    # The data table is the only one with a "summary" attribute
    table = None
    for t in res.findAll("table"):
        table = t if "summary" in t.attrs else table
    if table is None:
        exit()
    # stop_date = datetime.datetime(year=2018, month=7, day=12)
    # today = datetime.datetime.now()
    # abort = False
    price_list = []
    date_list = []
    rows = table.findAll("tr")[1:]  # skip the header row
    for row in rows:
        date = None
        cells = row.findAll("td")
        if cells[0].get("class") is None:
            continue  # placeholder row
        if "B6" in cells[0].get("class"):
            # e.g. "1997 Jan- 6 to Jan-10" -> "1997Jan-6" -> datetime
            d = cells[0].getText().split(" to ")[0].strip().replace(" ", "")
            date = datetime.datetime.strptime(d, "%Y%b-%d")
        for cell in cells:
            if "B3" in cell.get("class"):  # and abort == False:
                price = cell.getText().strip()
                if price == "" or price == "NA":
                    price = ""
                else:
                    price = float(price)
                price_list.append(price)
                date_list.append(date)
                date = date + datetime.timedelta(days=1)
                # if date > today: abort = True
        # if abort == True: break
    d1 = pd.DataFrame({'Date': date_list})
    d2 = pd.DataFrame({'Price': price_list})
    df = pd.concat([d1, d2], axis=1)
    print(df)
    df.to_csv(r"Gas Price.csv", index=False, header=True)

I wasn't entirely clear what you wanted for Date, so I extracted both and called them Start Date and End Date.
In:
df = pd.DataFrame({'Date': ['1997 Jan- 6 to Jan-10', '1997 Jan-13 to Jan-17'], 'Price': [3.80, 5.00]})
df['Temp_Year'] = df.Date.str.extract(r'((?:19|20)\d\d)', expand=False)
df['Temp_Date'] = df.Date.str.replace(r'((?:19|20)\d\d)', '', regex=True)
df[['Start Date', 'End Date']] = df.Temp_Date.str.split('to', expand=True)
df['Start Date'] = pd.to_datetime(df['Temp_Year'] + ' ' + df['Start Date'].str.replace(" ", ""))
df['End Date'] = pd.to_datetime(df['Temp_Year'] + ' ' + df['End Date'].str.replace(" ", ""))
df.drop(['Temp_Year', 'Temp_Date'], axis=1)
Out:
| | Date | Price | Start Date | End Date |
|---|-----------------------|-------|------------|------------|
| 0 | 1997 Jan- 6 to Jan-10 | 3.8 | 1997-01-06 | 1997-01-10 |
| 1 | 1997 Jan-13 to Jan-17 | 5.0 | 1997-01-13 | 1997-01-17 |
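If you then want one row per day, as the question asks, each Start/End pair can be expanded with pd.date_range and explode. A sketch under the assumption that the frame looks like the Out above:
import pandas as pd

# minimal frame shaped like the result above
df = pd.DataFrame({'Start Date': pd.to_datetime(['1997-01-06', '1997-01-13']),
                   'End Date': pd.to_datetime(['1997-01-10', '1997-01-17']),
                   'Price': [3.8, 5.0]})
# one list of dates per row, then one row per date
df['Date'] = df.apply(lambda r: pd.date_range(r['Start Date'], r['End Date']), axis=1)
daily = df.explode('Date')[['Date', 'Price']]
print(daily)  # five rows per week, the weekly price repeated on each day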

Related

Removing unwanted characters/words

I'm struggling to remove some characters from the extracted data. I've managed to remove the '£' from the price and that's it.
Tried:
data = json.loads(r.text)
products = data['upcoming']
product_list = []
for product in products:
    price = product['price']
    date = product['launchDate']
    productsforsale = {
        'Retail_price': price,
        'Launch_date': date,
    }
    product_list.append(productsforsale)
df = pd.DataFrame(product_list).replace('£', "")
df.to_csv('PATH.csv')
print('saved to file')
Expected outcome:
110.00 2023-01-15 08:00
You can get the amount from the price dictionary with price['amount']. The time can be converted to your desired format with the datetime module:
from datetime import datetime
datetime_date = datetime.strptime(date, "%Y-%m-%dT%H:%M:%S.%fZ")
new_date = datetime_date.strftime("%Y-%m-%d %H:%M")
I can't test it against your original .json snippet, though.
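For example, with a made-up launchDate string in the format the code above expects:
from datetime import datetime

date = "2023-01-15T08:00:00.000Z"  # example value, not taken from the real feed
datetime_date = datetime.strptime(date, "%Y-%m-%dT%H:%M:%S.%fZ")
print(datetime_date.strftime("%Y-%m-%d %H:%M"))  # 2023-01-15 08:00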
You can format the time with strftime once the string has been parsed into a datetime:
date = datetime.strptime(product['launchDate'], "%Y-%m-%dT%H:%M:%S.%fZ").strftime("%Y-%m-%d %H:%M")
You're currently not getting the price correctly: you extract the whole price element, but you only want the amount within it:
price = product['price']['amount']
The full code:
import json
from datetime import datetime
import pandas as pd

data = json.loads(r.text)
products = data['upcoming']
rows = []
for product in products:
    price = product['price']['amount']
    date = datetime.strptime(product['launchDate'], "%Y-%m-%dT%H:%M:%S.%fZ")
    date = date.strftime("%Y-%m-%d %H:%M")
    rows.append({"Price": price, "Date": date})
# DataFrame.append was removed in pandas 2.0, so collect the rows in a list
df = pd.DataFrame(rows)
df.to_csv('PATH.csv')
print('saved to file')
This should save a CSV with 2 columns, Price and Date, with all the unnecessary info removed.
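As an alternative sketch, pd.json_normalize can flatten the nested dicts in one call; the record shape here is assumed from the question:
import pandas as pd

# example record shaped like an entry of data['upcoming']
products = [{'price': {'amount': 110.00}, 'launchDate': '2023-01-15T08:00:00.000Z'}]

df = pd.json_normalize(products)[['price.amount', 'launchDate']]
df.columns = ['Price', 'Date']
df['Date'] = pd.to_datetime(df['Date']).dt.strftime('%Y-%m-%d %H:%M')
print(df)  # one row: 110.0, 2023-01-15 08:00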

yfinance: print only the price of a stock

I want to print only the price for a stock from yfinance. This is what I get/have now:
ticker = "aapl"
start = datetime.now().strftime('%Y-%m-%d')
end = datetime.now().strftime('%Y-%m-%d')
data = pdr.get_data_yahoo(ticker, start, end)
data['EMA10'] = data['Close'].ewm(span=10, adjust=False).mean()
print(data['EMA10'])
and this is the response:
Date
2022-03-04 163.169998
Name: EMA10, dtype: float64
I only want to print 163....
You obtain a pd.Series. To select the first value within that series, use data['EMA10'].iloc[0] (positional indexing with plain [0] is deprecated in recent pandas).
The entire code is given below:
from datetime import datetime
import pandas_datareader as pdr
ticker = "AAPL"
start = datetime.now().strftime('%Y-%m-%d')
end = datetime.now().strftime('%Y-%m-%d')
data = pdr.get_data_yahoo(ticker, start, end)
data['EMA10'] = data['Close'].ewm(span=10, adjust=False).mean()
print(data['EMA10'].iloc[0])
Output:
163.1699981689453
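Note that pandas_datareader's Yahoo endpoint has been unreliable for some time; the yfinance package offers a similar interface. A hedged sketch of the same idea (the column layout can vary between yfinance versions):
import yfinance as yf

# one month of daily bars; .history() returns a plain DataFrame
data = yf.Ticker("AAPL").history(period="1mo")
data['EMA10'] = data['Close'].ewm(span=10, adjust=False).mean()
print(data['EMA10'].iloc[-1])  # most recent EMA10 value as a plain float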

Python: if printed results are the same as before, print "no change"; otherwise print the new results. Run again every 10 mins

I want to run this script every 10 minutes, and if the results are the same I don't want to print them again until they change. Is this even possible? I'm not a programmer by any means; this is just a hobby.
I'm using Twilio to send me an SMS message for campsites that are available, but I don't want to receive the same SMS every 10 minutes. I removed the Twilio code because it contains my account info. Thank you in advance for any help. Here is my code below.
from datetime import datetime
import pandas as pd
import requests
from tabulate import tabulate

result = []
for unit_id in range(5095, 5099):
    resp = requests.get(
        f"https://calirdr.usedirect.com/rdr/rdr/fd/"
        f"availability/getbyunit/{unit_id}/startdate/2020-10-30/nights/30/true?").json()
    result.extend(resp)

filter_by = ['UnitId', 'StartTime', 'IsFree', 'IsWalkin']
df = pd.DataFrame(result)
df = df.filter(items=filter_by)
df['StartTime'] = df['StartTime'].apply(lambda d: datetime.fromisoformat(d).strftime("%Y-%m-%d"))
df = df[df['IsFree']]
df = df[~df['IsWalkin']]
df['UnitId'] = df['UnitId'].replace([5095], 'Site 81')
df['UnitId'] = df['UnitId'].replace([5096], 'Site 82')
df['UnitId'] = df['UnitId'].replace([5097], 'Site 83')
df['UnitId'] = df['UnitId'].replace([5098], 'Site 84')
df['UnitId'] = df['UnitId'].replace([5099], 'Site 85')
print(tabulate(df, headers=filter_by))
Below are the results if you run the code.
UnitId StartTime IsFree IsWalkin
-- -------- ----------- -------- ----------
62 Site 83 2020-11-01 True False
80 Site 83 2020-11-19 True False
89 Site 83 2020-11-28 True False
Process finished with exit code 0
This will run the program, wait ten minutes, check whether the previous result is the same as the current one, and print the result only when it has changed. The part left for you is to figure out how to suppress the output only until the next day :)
Edit: I updated the code based on your comment.
from datetime import datetime
import pandas as pd
import requests
from tabulate import tabulate
import time

def main():
    result = []
    for unit_id in range(5095, 5099):
        resp = requests.get(
            f"https://calirdr.usedirect.com/rdr/rdr/fd/"
            f"availability/getbyunit/{unit_id}/startdate/2020-10-30/nights/30/true?").json()
        result.extend(resp)
    filter_by = ['UnitId', 'StartTime', 'IsFree', 'IsWalkin']
    df = pd.DataFrame(result)
    df = df.filter(items=filter_by)
    df['StartTime'] = df['StartTime'].apply(lambda d: datetime.fromisoformat(d).strftime("%Y-%m-%d"))
    df = df[df['IsFree']]
    df = df[~df['IsWalkin']]
    df['UnitId'] = df['UnitId'].replace([5095], 'Site 81')
    df['UnitId'] = df['UnitId'].replace([5096], 'Site 82')
    return tabulate(df, headers=filter_by)

res_before = ""
while True:
    res = main()
    if res != res_before:
        print(res)
        res_before = res
    else:
        print("nothing changed")
    time.sleep(600)
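To also reset once per day, so that a site still available tomorrow triggers a fresh SMS, one minimal sketch is to track the date alongside the last result:
import time
from datetime import date

res_before = ""
day_before = date.today()
while True:
    res = main()  # the function defined above
    today = date.today()
    if res != res_before or today != day_before:
        print(res)  # send the SMS here instead of printing
        res_before = res
        day_before = today
    else:
        print("nothing changed")
    time.sleep(600)  # 10 minutes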

Web scraping data from a JSON source: why do I get only 1 row?

I'm trying to get some information from a webshop with Python. I tried this:
import requests
import pandas as pd
from datetime import datetime

def proba():
    my_url = requests.get('https://www.telekom.hu/shop/categoryresults/?N=10994&contractType=list_price&instock_products=1&Ns=sku.sortingPrice%7C0%7C%7Cproduct.displayName%7C0&No=0&Nrpp=9&paymentType=FULL')
    data = my_url.json()
    results = []
    products = data['MainContent'][0]['contents'][0]['productList']['products']
    for product in products:
        name = product['productModel']['displayName']
        try:
            priceGross = product['priceInfo']['priceItemSale']['gross']
        except:
            priceGross = product['priceInfo']['priceItemToBase']['gross']
        url = product['productModel']['url']
        results.append([name, priceGross, url])
    df = pd.DataFrame(results, columns=['Name', 'Price', 'Url'])
    # print(df)  ## print df
    df.to_csv(r'/usr/src/Python-2.7.13/test.csv', sep=',', encoding='utf-8-sig', index=False)

while True:
    mytime = datetime.now().strftime("%H:%M:%S")
    while mytime < "23:59:59":
        print mytime  # Python 2 print statement, matching the asker's environment
        proba()
        mytime = datetime.now().strftime("%H:%M:%S")
In this webshop there are 9 items, but I only see 1 row in the CSV file.
Not entirely sure what you intend as the end result. Do you want to update an existing file, or get the data and write it all out in one go? An example of the latter is shown below, where I add each new dataframe to an overall dataframe and use a return statement in the function to hand back each new dataframe.
import requests
from datetime import datetime
import pandas as pd

def proba():
    my_url = requests.get('https://www.telekom.hu/shop/categoryresults/?N=10994&contractType=list_price&instock_products=1&Ns=sku.sortingPrice%7C0%7C%7Cproduct.displayName%7C0&No=0&Nrpp=9&paymentType=FULL')
    data = my_url.json()
    results = []
    products = data['MainContent'][0]['contents'][0]['productList']['products']
    for product in products:
        name = product['productModel']['displayName']
        try:
            priceGross = product['priceInfo']['priceItemSale']['gross']
        except:
            priceGross = product['priceInfo']['priceItemToBase']['gross']
        url = product['productModel']['url']
        results.append([name, priceGross, url])
    df = pd.DataFrame(results, columns=['Name', 'Price', 'Url'])
    return df

headers = ['Name', 'Price', 'Url']
df = pd.DataFrame(columns=headers)
while True:
    mytime = datetime.now().strftime("%H:%M:%S")
    while mytime < "23:59:59":
        print(mytime)
        dfCurrent = proba()
        mytime = datetime.now().strftime("%H:%M:%S")
        df = pd.concat([df, dfCurrent])
        df.to_csv(r"C:\Users\User\Desktop\test.csv", encoding='utf-8')

pandas does not append to df as it should on each iteration

I have a df that does not behave. Please help me train it!
I need each iteration that gets through to the deepest nested 'if' statement (i.e. satisfies all my requirements) to be appended to my df df_comp_KPIs.
Why does this code not work? Any ideas?
import time
import urllib.request, urllib.error, urllib.parse
import pandas as pd
from bs4 import BeautifulSoup as bs4

start = time.time()  # Start script timer

# Creating the df that will save my results in the Yahoo KPI iterations
df_comp_KPIs = pd.DataFrame()  # columns = ('Ticker','Mark.Cap','PriceToBook','PEG5','TrailPE12Mo','DeptToEquit.')

ofInterest = ['AAN', 'ANF', 'ANCX', 'ACE', 'ATVI', 'AET', 'AGCO', 'ATSG', 'AWH', 'ALL', 'AFAM', 'ALJ']
evenBetter = []

# add some more to powers as necessary
powers = {'M': 10 ** 6, 'B': 10 ** 9, 'T': 10 ** 12}

# Convert the string from the Market Cap col to float.
def stringNoToFloat(s):
    try:
        power = s[-1]
        return float(s[:-1]) * powers[power]
    except TypeError:
        return s

def yahoostats(ticker):
    try:
        print('doing', ticker)
        url = 'http://finance.yahoo.com/q/ks?s=' + ticker
        page = urllib.request.urlopen(url)
        soup = bs4(page)
        page.close()
        # Look up each label and take the content of the next 'td' tag
        mcap = soup.find(text='Market Cap (intraday)').findNext('td').string
        pbr = float(soup.find(text='Price/Book (mrq):').findNext('td').string)
        if float(pbr) < 3:
            PEG5 = float(soup.find(text='PEG Ratio (5 yr expected)').findNext('td').string)
            if 0 < float(PEG5) < 3:
                DE = float(soup.find(text='Total Debt/Equity (mrq):').findNext('td').string)
                # if 0 < float(DE) < 2:
                PE12 = float(soup.find(text='Trailing P/E (ttm, intraday):').findNext('td').string)
                if float(PE12) < 15:
                    evenBetter.append(ticker)
                    df_comp_KPIs = df_comp_KPIs.append({'Ticker': ticker, 'Mark.Cap': mcap,
                                                        'PriceToBook': pbr, 'PEG5': PEG5,
                                                        'TrailPE12Mo': PE12, 'DeptToEquit': DE},
                                                       ignore_index=True)
                    df_comp_KPIs = df_comp_KPIs.sort(['PriceToBook', 'PEG5'], ascending=[1, 1])
                    print('____________________________')
                    print('')
                    print(ticker, 'meets requirements')
                    print('Market Cap (intraday):', mcap)
                    print('price to book:', pbr)
                    print('PEG forward 5 years', PEG5)
                    print('Trailing PE (12mo):', PE12)
                    print('Debt to Equity:', DE)
                    print('____________________________')
                    # saving ticker KPIs to csv
                    df_comp_KPIs.to_csv('df_company_KPIs.csv')
    except Exception as e:
        print('failed in the main loop:', str(e))
    return mcap, pbr, PEG5, PE12, DE

if __name__ == '__main__':
    for eachticker in ofInterest:
        yahoostats(eachticker)
        # time.sleep(.05)
    print(evenBetter)
    print()
    print('Company screener finished in %.1f seconds' % (time.time() - start))
    # Convert string with MarketCap to float in one go on the mcap col in df
    df_comp_KPIs['Mark.Cap'].applymap(stringNoToFloat)
OK. It turns out I got this error because of a missing global declaration for my df, so the code could not see the df created outside the function.
So the beginning of my function should look like this instead of the above:
def yahoostats(ticker):
    global df_comp_KPIs
    try:
        print('doing', ticker)
        url = 'http://finance.yahoo.com/q/ks?s=' + ticker
        page = urllib.request.urlopen(url)
This solves the issue and I can alter, call, or do anything else with my df.
I found the answer here:
http://eli.thegreenplace.net/2011/05/15/understanding-unboundlocalerror-in-python
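The same point in miniature: assigning to a name inside a function makes that name local unless it is declared global. A common alternative, sketched here with placeholder values, is to return the row and accumulate outside the function, avoiding the global entirely:
import pandas as pd

def yahoostats(ticker):
    # ... scrape the KPIs as above, then hand them back as a dict
    return {'Ticker': ticker, 'PriceToBook': 1.2}  # placeholder values

rows = [yahoostats(t) for t in ['AAN', 'ANF']]
df_comp_KPIs = pd.DataFrame(rows).sort_values('PriceToBook')
print(df_comp_KPIs)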
