My project is relatively straightforward. I am attempting to create a web-scraping tool that retrieves a random event from any given wikipedia article for a given date. The format of the URL is: url = f"https://en.wikipedia.org/wiki/{month}_{day}" where the month is the full name of the month followed by the day.
What I'm trying to achieve:
What I'm trying to achieve specifically here is that if an invalid date such as June 31 or Feb 30 were input, then the function below stops and returns "Please provide a valid date" as its output, without raising an error.
Attempted Solution:
I've tried this with an if statement mapping a set of months to a set of dates but it's pretty wonky, as shown below:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
month = 'January'
day = '99'
url = f"https://en.wikipedia.org/wiki/{month}_{day}"
thirty = [*range(1,31)]
thirty = [str(x) for x in thirty]
thirty_one = [*range(1,32)]
thirty_one = [str(x) for x in thirty_one]
twenty_nine = [*range(1,30)]
twenty_nine = [str(x) for x in twenty_nine]
soup = BeautifulSoup(requests.get(url).content, "html.parser")
def wikiscraper():
events = []
if (month == set(['April','June','September','November']) and day != set(thirty))| \
(month == set(['January','March','May','July','August','October','December']) and day != set(thirty_one))| \
(month == set(['February']) and day != set(twenty_nine)):
return print("Please provide a valid date")
else:
for li in soup.select("h3 + ul > li"):
if (h2 := li.find_previous("h2")) and (h2.find(id="Events")):
date, event = li.text.replace("–", "-").split(" - ", maxsplit=1)
events.append((date, event))
events = pd.DataFrame(events)
cols = ['year','event']
events = pd.DataFrame(events)
events.columns = cols
pd.options.display.max_colwidth = 300
events['combined'] = 'On this date in the year'+' '+events.year+' '+events.event
events = events[['combined']]
return events.sample()
wikiscraper()
which returns
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-130-b89d9001cf84> in <module>
----> 1 wikiscraper()
<ipython-input-129-2c547a995093> in wikiscraper()
18 cols = ['year','event']
19 events = pd.DataFrame(events)
---> 20 events.columns = cols
21
22 pd.options.display.max_colwidth = 300
~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/generic.py in __setattr__(self, name, value)
5150 try:
5151 object.__getattribute__(self, name)
-> 5152 return object.__setattr__(self, name, value)
5153 except AttributeError:
5154 pass
pandas/_libs/properties.pyx in pandas._libs.properties.AxisProperty.__set__()
~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/generic.py in _set_axis(self, axis, labels)
562 def _set_axis(self, axis: int, labels: Index) -> None:
563 labels = ensure_index(labels)
--> 564 self._mgr.set_axis(axis, labels)
565 self._clear_item_cache()
566
~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/internals/managers.py in set_axis(self, axis, new_labels)
224
225 if new_len != old_len:
--> 226 raise ValueError(
227 f"Length mismatch: Expected axis has {old_len} elements, new "
228 f"values have {new_len} elements"
ValueError: Length mismatch: Expected axis has 0 elements, new values have 2 elements
I also tried it with assert which works, but I want to keep it clean without an assertion error and just have a printed output requesting a valid date. The if statement I'm sure is not a very "pythonic" way of doing it either, although getting to run with the desired output is the bigger priority.
My ultimate goal is to simply get the function to stop and Please provide a valid date if the string input is not compatible with real dates.
edit. The solution was simple
if (month in set(['April','June','September','November']) and day not in set(thirty))| \
(month in set(['January','March','May','July','August','October','December']) and day not in set(thirty_one))| \
(month in set(['February']) and day not in set(twenty_nine)):
return print("Please provide a valid date")
just use `in` and `not in` instead of `==` and `!=`. User Timus to the rescue on a dumb mistake.
Related
I'm preparing a Python script to screen stocks from the S&P 500, DOW and Nasdaq.
SP500 and DOW importing data stocks is working properly , but when I try to import Nasdaq always get similar error, related to timestamp.
See below:
My code:
import talib
from yahoo_fin.stock_info import get_data
import yahoo_fin.stock_info as si
from datetime import datetime
list = si.tickers_nasdaq()
# Get current date and time
now = datetime.now().strftime("%m_%d_%Y_%I_%M_%S")
end_date = datetime.now().strftime('%Y-%m-%d')
start_date = (datetime.now() - timedelta(days=365)).strftime('%Y-%m-%d')
# Create file to save results
f = open(f'C:/Users/fco_j/OneDrive/Escritorio/CHAT GPT/Python/Reports/dow_results_{now}.csv', 'w')
# print table header to file
f.write('Ticker, ClosePrice, SMA200, SMA20, RSI, RelVol\n')
# Define cache_data function
def cache_data(data, stock):
data.to_pickle(f'C:/Users/fco_j/OneDrive/Escritorio/CHAT GPT/Python/Pickle/{stock}.pkl')
for stock in list:
# Download historical data for past year
data = si.get_data(stock, start_date=start_date, end_date=end_date)
last_price = data["close"][-1]
# Get 150 and 20 simple moving averages using Talib
sma150 = talib.SMA(data['close'], timeperiod=150)[-1]
sma20 = talib.SMA(data['close'], timeperiod=20)[-1]
rsi = talib.RSI(data['close'], timeperiod=14)
# Calculate Relative Volume
rel_vol = data['volume'] / talib.SMA(data['volume'].values.astype(float), timeperiod = 50)
# Cache data
cache_data(data, stock)
# Filter stocks with relative volume (time period 20) over 1
if last_price > sma150 and last_price > sma20 and rsi[-1] > 50 and rel_vol[-1] > 1:
# Print results to file
f.write(f"{stock},{last_price},{sma150},{sma20},{rsi[-1]},{rel_vol[-1]}\n")
f.close()
The error:
KeyError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_11208/2663596324.py in
26 for stock in dow_list:
27 # Download historical data for past year
---> 28 data = si.get_data(stock, start_date=start_date, end_date=end_date)
29 last_price = data["close"][-1]
30 # Get 150 and 20 simple moving averages using Talib
~\anaconda3\envs\PyFinance\lib\site-packages\yahoo_fin\stock_info.py in get_data(ticker, start_date, end_date, index_as_date, interval, headers)
98
99 # get the date info
--> 100 temp_time = data["chart"]["result"][0]["timestamp"]
101
102 if interval != "1m":
KeyError: 'timestamp'
The code is working with si.tickers_dow() and si.tickers_sp500() , but not with si.tickers_nasdaq() .
Not sure if a dataframe issue.
import pandas as pd
df = pd.read_csv("stocks.csv")
df = df.where(pd.notnull(df), None)
df["date"] = pd.to_datetime(df["date"])
m = df.loc[df['market'] == "NASDAQ"].max(numeric_only=True)
m
This returns the following data
price 99614.04
dtype: float64
now when I try to use the variable 'm' I receive the following error
import pandas as pd
df = pd.read_csv("stocks.csv")
df = df.where(pd.notnull(df), None)
df["date"] = pd.to_datetime(df["date"])
m = df.loc[df['market'] == "NASDAQ"].max(numeric_only=True)
df.loc[(df['market'] == "NASDAQ") & (df["price"] == m)]
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_62032/1757287628.py in <module>
5 df["date"] = pd.to_datetime(df["date"])
6 m = df.loc[df['market'] == "NASDAQ"].max(numeric_only=True)
----> 7 df.loc[(df['market'] == "NASDAQ") & (df["price"] == m)]
~\Anaconda3\lib\site-packages\pandas\core\ops\common.py in new_method(self, other)
67 other = item_from_zerodim(other)
68
---> 69 return method(self, other)
70
71 return new_method
~\Anaconda3\lib\site-packages\pandas\core\arraylike.py in __eq__(self, other)
30 #unpack_zerodim_and_defer("__eq__")
31 def __eq__(self, other):
---> 32 return self._cmp_method(other, operator.eq)
33
34 #unpack_zerodim_and_defer("__ne__")
~\Anaconda3\lib\site-packages\pandas\core\series.py in _cmp_method(self, other, op)
5494
5495 if isinstance(other, Series) and not self._indexed_same(other):
-> 5496 raise ValueError("Can only compare identically-labeled Series objects")
5497
5498 lvalues = self._values
ValueError: Can only compare identically-labeled Series objects
But when I use the actual value for 'm' it works.
import pandas as pd
df = pd.read_csv("stocks.csv")
df = df.where(pd.notnull(df), None)
df["date"] = pd.to_datetime(df["date"])
m = df.loc[df['market'] == "NASDAQ"].max(numeric_only=True)
df.loc[(df['market'] == "NASDAQ") & (df["price"] == 99614.04)]
id name price symbol industry market currency date
25 1abf2ffc-3396-4ed9-954d-956be97668c0 Brocade Communications Systems, Inc. 99614.04 BRCD Computer Communications Equipment NASDAQ PLN 2020-09-12
Could someone please explain why this interaction is playing out this way?
Return value is a Series, you can use
m = df.loc[df['market'] == "NASDAQ", 'price'].max(numeric_only=True)
# or
m = df.loc[df['market'] == "NASDAQ"].max(numeric_only=True).item()
Use the following instead, currently you're returning a Series because you're not specifying from which Column you want to take the max.
m = df[df['market'].eq("NASDAQ")]['price'].max()
I try to get data from google trends in a g sheet. First time it runned smoothly, second time not so much. I got an error called:
ValueError: No objects to concatenate
I searched this error on Stack Overflow before but couldn't find any solutions. I use the code displayed below:
!pip install Pytrends
!pip install pandas
!pip install pytrends --upgrade <---------Note: this solved a different error.
from pytrends.request import TrendReq
import pandas as pd
import time
startTime = time.time()
pytrend = TrendReq(hl='nl-NL', tz=360)
df = wb = gc.open_by_url('https://docs.google.com/spreadsheets/d/1QE1QilM-GDdQle6eVunepqG5RNWv39xO0By84C19Ehc/edit?usp=sharing')
sheet = wb.sheet1
df2 = sheet.col_values(5)
d_from = sheet.acell('B7').value
d_to = sheet.acell('B8').value
geo1 = sheet.acell('B10').value
dataset = []
for x in range(1,len(df2)):
keywords = [df2[x]]
pytrend.build_payload(
kw_list=keywords,
cat=0,
timeframe= str(d_from + " " + d_to),
geo= str(geo1))
data = pytrend.interest_over_time()
if not data.empty:
data = data.drop(labels=['isPartial'],axis='columns')
dataset.append(data)
result = pd.concat(dataset, axis=1)
result.to_csv('search_trends_DOWNLOAD_ME.csv')
!cp search_trends_DOWNLOAD_ME.csv "/content/drive/My Drive/Colab Notebooks/Output"
executionTime = (time.time() - startTime)
print('Execution time in sec.: ' + str(executionTime))
The error I got:
ValueError Traceback (most recent call last)
<ipython-input-5-b86c7b4df727> in <module>()
25 data = data.drop(labels=['isPartial'],axis='columns')
26 dataset.append(data)
---> 27 result = pd.concat(dataset, axis=1)
28 result.to_csv('search_trends_DOWNLOAD_ME.csv')
29 get_ipython().system('cp search_trends_DOWNLOAD_ME.csv "/content/drive/My Drive/Colab Notebooks/Output"')
1 frames
/usr/local/lib/python3.6/dist-packages/pandas/core/reshape/concat.py in __init__(self, objs, axis, join, keys, levels, names, ignore_index, verify_integrity, copy, sort)
327
328 if len(objs) == 0:
--> 329 raise ValueError("No objects to concatenate")
330
331 if keys is None:
ValueError: No objects to concatenate
The keywords I use are located in df = wb = gc.open_by_url. It is a g-sheet with the location, language and the keywords.
This happened to me earlier; it was just a mistyped path/URL for the file.
Check the path again.
I'm new to the python language (python3.6). I try to have iris with longitude and latitude, but I have some errors that I can't correct.
What i want to achieve : I have coordinates (latitude and longitude) in a CSV file. In the other hand i have a geojsonfile where i have polygons shapes. I would like to see if my coordinates are contained in the range of each polygon of my file.
But i have some trouble with the code below, i don't understand the following error Error object of type 'NoneType' has no len() at the last block of code.
If my post miss some details please tell me, i'll be glad to add some informations to help you understand the situation :)
This is my code :
import json, csv
import numpy
from shapely.geometry import shape, Point
def readJson(url):
response = open(url)
return json.loads(response.read())
def readCSV(url):
response = open(url)
return csv.DictReader(response, delimiter=',')
def getIris():
"""
Returns a dictionary formed by the id of an iris and its coordinates.
"""
dict = {}
url = 'iris.json'
data2 = readJson(url)
for district in data2['features']:
dict[district['id']] = district['geometry']
return dict
def getPOIs():
"""
Returns a list of tuples of POIs lat/long coordinates.
"""
urls = [
"./result.csv",
]
POIs = []
for url in urls:
csv = readCSV(url)
for line in csv:
latitude = line.get('latitude', None)
longitude = line.get('longitude', None)
if latitude is not None and longitude is not None:
POIs.append((float(longitude), float(latitude)))
return POIs
def POIsInIris(iris, POIs):
"""
Returns a dictionary formed by the id of a iris and the number of POIs that falls in
this iris.
"""
dict = {}
for key, value in iris.items():
dict[key] = 0
polygon = shape(value)
for p in POIs:
point = Point(p[0], p[1])
# print point.wkt
if polygon.contains(point):
dict[key] += 1
return dict
if __name__ == '__main__':
# Geographical Features
iris_bbox = getIris()
iris_number = len(iris_bbox)
print ("Iris: ", iris_number)
print ("Reading POIs...")
POIs = getPOIs()
print (len(POIs))
print ("Done Reading POIs")
print ("Calculating POIs per Iris")
POIsPerIris = POIsInIris(iris_bbox, POIs)
for k,v in POIsPerIris.items():
print (k,v)
And the output :
Iris: 49404
Reading POIs...
0
Done Reading POIs
Calculating POIs per Iris
Moreover I have a 0 for print (len(POIs)), and I don't understand why.
Thank you a lot
Edit : Here is the full error message :
TypeError Traceback (most recent call last)
<ipython-input-55-247f0f9756f9> in <module>()
10 print ("Done Reading POIs")
11 print ("Calculating POIs per Iris")
---> 12 POIsPerIris = POIsInIris(iris_bbox, POIs)
13 for k,v in POIsPerIris.items():
14 print (k,v)
<ipython-input-54-0877c0182800> in POIsInIris(iris, POIs)
8 for key, value in iris.items():
9 dict[key] = 0
---> 10 polygon = shape(value)
11 for p in POIs:
12 point = Point(p[0], p[1])
~/anaconda3/lib/python3.6/site-packages/shapely/geometry/geo.py in shape(context)
39 return MultiLineString(ob["coordinates"])
40 elif geom_type == "multipolygon":
---> 41 return MultiPolygon(ob["coordinates"], context_type='geojson')
42 elif geom_type == "geometrycollection":
43 geoms = [shape(g) for g in ob.get("geometries", [])]
~/anaconda3/lib/python3.6/site-packages/shapely/geometry/multipolygon.py in __init__(self, polygons, context_type)
62 self._geom, self._ndim = geos_multipolygon_from_polygons(polygons)
63 elif context_type == 'geojson':
---> 64 self._geom, self._ndim = geos_multipolygon_from_py(polygons)
65
66 def shape_factory(self, *args):
~/anaconda3/lib/python3.6/site-packages/shapely/geometry/multipolygon.py in geos_multipolygon_from_py(ob)
136 assert L >= 1
137
--> 138 N = len(ob[0][0][0])
139 assert N == 2 or N == 3
140
TypeError: object of type 'NoneType' has no len()
I am attempting to read in a csv that has a new data entry every 15 minutes. From what I can gather, the reason I am returning this exception is because the date doesn't change every row, but the time does. However, the feed isn't reading in the time and I'm not sure how to fix that. Here is my code:
from pyalgotrade.feed import csvfeed
feed = csvfeed.Feed("Date","%d/%m/%Y")
feed.addValuesFromCSV("eurusd-15m-july-small.csv")
for dateTime, value in feed:
print (dateTime, value)
My csv look like this:
Date,Time,Open,High,Low,Close,Volume
07/08/2018,17:30:00,1.15994,1.15994,1.15961,1.15982,414
07/08/2018,17:45:00,1.15982,1.16001,1.15964,1.15996,485
Here is the full error:
(datetime.datetime(2018, 8, 7, 0, 0), {'High': 1.15994, 'Volume': 414.0,
'Low': 1.15961, 'Time': '17:30:00', 'Close': 1.15982, 'Open': 1.15994})
---------------------------------------------------------------------------
Exception Traceback (most recent call last)
<ipython-input-12-0cbdbe588a05> in <module>()
3 feed = csvfeed.Feed("Date","%d/%m/%Y")
4 feed.addValuesFromCSV("eurusd-15m-july-small.csv")
----> 5 for dateTime, value in feed:
6 print (dateTime, value)
/Users/Phil/anaconda2/lib/python2.7/site-
packages/pyalgotrade/feed/__init__.pyc in feed_iterator(feed)
29 try:
30 while not feed.eof():
---> 31 yield feed.getNextValuesAndUpdateDS()
32 finally:
33 feed.stop()
/Users/Phil/anaconda2/lib/python2.7/site-
packages/pyalgotrade/feed/__init__.pyc in getNextValuesAndUpdateDS(self)
88 ds = self.createDataSeries(key, self.__maxLen)
89 self.__ds[key] = ds
---> 90 ds.appendWithDateTime(dateTime, value)
91 return (dateTime, values)
92
/Users/Phil/anaconda2/lib/python2.7/site-
packages/pyalgotrade/dataseries/__init__.pyc in appendWithDateTime(self,
dateTime, value)
134
135 if dateTime is not None and len(self.__dateTimes) != 0 and
self.__dateTimes[-1] >= dateTime:
--> 136 raise Exception("Invalid datetime. It must be bigger than
that last one")
137
138 assert(len(self.__values) == len(self.__dateTimes))
Exception: Invalid datetime. It must be bigger than that last one
Thanks in advance!
The problem is that you have the datetime in two separate columns, so as @daniel mentions you have to customize parsing. Try this:
import datetime
from pyalgotrade.feed import csvfeed
# Row parser that extracts the datetime combining 2 columns
class RowParser(csvfeed.RowParser):
def parseRow(self, csvRowDict):
dateTimeCombined = "%s %s" % (csvRowDict["Date"], csvRowDict["Time"])
dateTime = datetime.datetime.strptime(dateTimeCombined, "%d/%m/%Y %H:%M:%S")
# Convert the values
values = {}
for key, value in csvRowDict.items():
if key not in ["Date", "Time"]:
values[key] = csvfeed.float_or_string(key, value)
return (dateTime, values)
def getFieldNames(self):
return None
def getDelimiter(self):
return ","
feed = csvfeed.BaseFeed(RowParser())
feed.addValuesFromCSV("eurusd-15m-july-small.csv")
for dateTime, value in feed:
print (dateTime, value)