Related
I am trying to run the following code:
import time
import pandas as pd
import numpy as np
# Maps each supported city name (exactly as the user must type it) to the
# CSV file holding that city's bikeshare trip data.
CITY_DATA = {'chicago': 'chicago.csv',
'new york city': 'new_york_city.csv',
'washington': 'washington.csv'}
def get_filters():
    """
    Ask the user to specify a city, month, and day to analyze.

    Returns:
        (str) city - name of the city to analyze
        (str) month - name of the month to filter by, or "all" to apply no month filter
        (str) day - name of the day of week to filter by, or "all" to apply no day filter
    """
    print('Hello! Let\'s explore some US bikeshare data!')
    # BUG FIX: normalize every answer with .strip().lower() so inputs like
    # 'Chicago' or ' ALL ' are accepted; the original rejected anything that
    # was not already lower-case with no surrounding spaces.
    # get user input for city (chicago, new york city, washington)
    while True:
        city = input('Which city you would like to explore : "chicago" , "new york city" , or "washington" :' ).strip().lower()
        if city in ('chicago', 'new york city', 'washington'):
            break
        print(" You entered wrong choice , please try again")
    # get user input for month (all, january, february, ... , june)
    while True:
        month = input('Enter "all" for all data or chose a month : "january" , "february" , "march", "april" , "may" or "june " :').strip().lower()
        if month in ("all", "january", "february", "march", "april", "may", "june"):
            break
        print(" You entered wrong choice , please try again")
    # get user input for day of week (all, monday, tuesday, ... sunday)
    while True:
        day = input('Enter "all" for all days or chose a day : "saturday", "sunday", "monday", "tuesday", "wednesday", "thursday", "friday": ').strip().lower()
        if day in ("all", "saturday", "sunday", "monday", "tuesday", "wednesday", "thursday", "friday"):
            break
        print(" You entered wrong choice , please try again")
    print('-'*60)
    return city, month, day
def load_data(city, month, day):
    """
    Loads data for the specified city and filters by month and day if applicable.

    Args:
        (str) city - name of the city to analyze (key of CITY_DATA)
        (str) month - name of the month to filter by, or "all" to apply no month filter
        (str) day - name of the day of week to filter by, or "all" to apply no day filter
    Returns:
        df - Pandas DataFrame containing city data filtered by month and day
    """
    df = pd.read_csv(CITY_DATA[city])
    # convert the Start Time column to datetime
    df['Start Time'] = pd.to_datetime(df['Start Time'])
    # extract month, day of week, and hour from Start Time into new columns
    df['month'] = df['Start Time'].dt.month
    # BUG FIX: dt.day_name is a *method* — without the call parentheses the
    # column held the bound-method object on every row, so comparing it with
    # 'Monday' etc. matched nothing and the filtered frame came back empty
    # (which is what produced "KeyError: 0" from .mode()[0] downstream).
    df['day_of_week'] = df['Start Time'].dt.day_name()
    df['hour'] = df['Start Time'].dt.hour
    # filter by month if applicable
    if month != 'all':
        # use the index of the month list to get the corresponding int (1-6)
        months = ['january', 'february', 'march', 'april', 'may', 'june']
        month = months.index(month) + 1
        df = df[df['month'] == month]
    # filter by day of week if applicable
    if day != 'all':
        df = df[df['day_of_week'] == day.title()]
    return df
def time_stats(df):
    """Displays statistics on the most frequent times of travel.

    Args:
        df - DataFrame with 'month', 'day_of_week' and 'hour' columns.
    """
    print('\nCalculating The Most Frequent Times of Travel...\n')
    start_time = time.time()
    # ROBUSTNESS: .mode()[0] raises KeyError: 0 on an empty frame (e.g. when
    # the chosen filters match no trips), so bail out with a message instead.
    if df.empty:
        print('\n No trips match the selected filters.\n')
        print('-'*60)
        return
    # display the most common month
    popular_month = df['month'].mode()[0]
    print('\n The most popular month is : \n', popular_month)
    # display the most common day of week
    popular_day = df['day_of_week'].mode()[0]
    print('\n The most popular day of the week is : \n', str(popular_day))
    # display the most common start hour
    popular_hour = df['hour'].mode()[0]
    print('\n The most popular hour of the day is :\n ', popular_hour)
    print("\nThis took %s seconds.\n" % (time.time() - start_time))
    print('-'*60)
def station_stats(df):
    """Displays statistics on the most popular stations and trip.

    Args:
        df - DataFrame with 'Start Station' and 'End Station' columns.
    """
    print('\nCalculating The Most Popular Stations and Trip...\n')
    start_time = time.time()
    # display most commonly used start station
    start_station = df['Start Station'].value_counts().idxmax()
    print('\n The most commonly used start station is : \n', start_station)
    # display most commonly used end station
    end_station = df['End Station'].value_counts().idxmax()
    print('\nThe most commonly used end station is: \n', end_station)
    # BUG FIX: groupby(...).value_counts() counted every *remaining* column
    # too, so idxmax() returned an entire row tuple (timestamps, gender, ...).
    # size() counts trips per (start, end) pair, which is the intended stat.
    combination = df.groupby(['Start Station', 'End Station']).size().idxmax()
    print('\nThe most frequent combination of start station and end station are: \n', combination)
    print("\nThis took %s seconds." % (time.time() - start_time))
    print('-'*40)
def trip_duration_stats(df):
    """Displays statistics on the total and average trip duration.

    Args:
        df - DataFrame with a 'Trip Duration' column (seconds per trip).
    """
    start_time = time.time()
    # Sum once with the pandas C-level Series.sum(); the original summed the
    # column twice with the builtin sum().
    total_time = df['Trip Duration'].sum()
    # display total travel time (in days, then in raw seconds)
    print('Total travel time:', total_time / 86400, " Days")
    # BUG FIX: the original passed '{} seconds' as a plain string without
    # calling .format(), so the literal braces were printed.
    print('\nThe total travel time is {} seconds: \n'.format(total_time))
    # display mean travel time
    mean_time = df['Trip Duration'].mean()
    print('\n The average travel time is \n', mean_time)
    print("\nThis took %s seconds." % (time.time() - start_time))
    print('-'*40)
def user_stats(df):
    """Displays statistics on bikeshare users.

    Args:
        df - DataFrame with a 'User Type' column; 'Gender' and 'Birth Year'
             are used when present (Washington's data lacks them).
    """
    print('\nCalculating User Stats...\n')
    start_time = time.time()
    # Display counts of user types
    user_types = df['User Type'].value_counts()
    print('User Types:\n', user_types)
    # Display counts of gender — guarded because the washington.csv file has
    # no Gender column (this completes the "TO DO" left in the original).
    if 'Gender' in df.columns:
        print('\nGender counts:\n', df['Gender'].value_counts())
    # Display earliest / most recent / most common birth year when available.
    if 'Birth Year' in df.columns:
        print('\nEarliest birth year:', int(df['Birth Year'].min()))
        print('Most recent birth year:', int(df['Birth Year'].max()))
        print('Most common birth year:', int(df['Birth Year'].mode()[0]))
    print("\nThis took %s seconds." % (time.time() - start_time))
    print('-'*40)
def main():
    """Run the interactive analysis loop until the user declines a restart."""
    keep_going = True
    while keep_going:
        city, month, day = get_filters()
        df = load_data(city, month, day)
        # Run every statistics report on the filtered frame, in order.
        for report in (time_stats, station_stats, trip_duration_stats, user_stats):
            report(df)
        restart = input('\nWould you like to restart? Enter yes or no.\n')
        keep_going = restart.lower() == 'yes'


if __name__ == "__main__":
    main()
and I am receiving the following errors. Can someone assist, please?
the errors:
> Traceback (most recent call last):
File "C:\Users\DELL\PycharmProjects\Professional\venv\Lib\site-packages\pandas\core\indexes\range.py", line 391, in get_loc
return self._range.index(new_key)
^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: 0 is not in range
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "C:\Users\DELL\PycharmProjects\Professional\Bikeshare.py", line 203, in <module>
main()
File "C:\Users\DELL\PycharmProjects\Professional\Bikeshare.py", line 192, in main
time_stats(df)
File "C:\Users\DELL\PycharmProjects\Professional\Bikeshare.py", line 100, in time_stats
popular_month = df['month'].mode()[0]
~~~~~~~~~~~~~~~~~~^^^
File "C:\Users\DELL\PycharmProjects\Professional\venv\Lib\site-packages\pandas\core\series.py", line 981, in __getitem__
Calculating The Most Frequent Times of Travel...
return self._get_value(key)
^^^^^^^^^^^^^^^^^^^^
File "C:\Users\DELL\PycharmProjects\Professional\venv\Lib\site-packages\pandas\core\series.py", line 1089, in _get_value
loc = self.index.get_loc(label)
^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\DELL\PycharmProjects\Professional\venv\Lib\site-packages\pandas\core\indexes\range.py", line 393, in get_loc
raise KeyError(key) from err
KeyError: 0
I am expecting to filter pandas DataFrame to return month, day of week, and hour to perform some statistics.
KeyError means that the key isn't valid, because it doesn't exist. In this case, one reason to get KeyError when trying to get first mode is when column 'month' in dataframe is empty, and therefore mode() returns an empty collection, so you get KeyError: 0 when trying to get its first element.
To avoid this, you could replace:
popular_month = df['month'].mode()[0]
With:
try:
# try to get first mode of column 'month'
popular_month = df['month'].mode()[0]
except KeyError:
# if there's no data on column 'month'
popular_month = "unknown"
Because if there's no data on 'month' column, there's no point in trying to get its mode.
More about handling exceptions: https://docs.python.org/3/tutorial/errors.html#handling-exceptions
Also when I tried to ( not use the filters) by choosing " all " in the second and 3rd input, I get the following result:
Calculating The Most Frequent Times of Travel...
The most popular month is :
6
The most popular day of the week is :
<bound method PandasDelegate._add_delegate_accessors.._create_delegator_method..f of <pandas.core.indexes.accessors.DatetimeProperties object at 0x0000022B7CD5E890>>
The most popular hour of the day is :
17
This took 0.0260775089263916 seconds.
Calculating The Most Popular Stations and Trip...
The most commonly used start station is :
Streeter Dr & Grand Ave
The most commonly used end station is:
Streeter Dr & Grand Ave
The most frequent combination of start station and end station are:
('2112 W Peterson Ave', '2112 W Peterson Ave', 1064651, Timestamp('2017-06-02 07:59:13'), '2017-06-02 08:25:42', 1589, 'Subscriber', 'Female', 1963.0, 6, <bound method PandasDelegate._add_delegate_accessors.._create_delegator_method..f of <pandas.core.indexes.accessors.DatetimeProperties object at 0x0000022B7CD5E890>>, 7)
This took 2.1254045963287354 seconds.
Total travel time: 3250.8308680555556 Days
The total travel time is {} seconds:
280871787
The average travel time is
936.23929
This took 0.06502270698547363 seconds.
Calculating User Stats...
User Types:
Subscriber 238889
Customer 61110
Dependent 1
Name: User Type, dtype: int64
This took 0.022009611129760742 seconds.
Would you like to restart? Enter yes or no.
# Lists the files in the 'when' folder, then rewrites the names so the
# encoded separators become real ones: "_-_" -> "-" and "pp" -> ":"
# (":" is not a legal character in Windows file names).
import os, datetime
content = os.listdir("when")
print(content)
#this print...
#['2022_-_12_-_29 12pp33 am _--_ 2023_-_01_-_25 19pp13 pm.txt', '2023-02-05 00pp00 am.txt']
for i in range(len(content)):
content[i] = content[i].replace("_-_", "-").replace("pp", ":")
print(content) #I prepare the input to use it to search
#this print...
#['2022-12-29 12:33 am _--_ 2023-01-25 19:13 pm.txt', '2023-02-05 00:00 am.txt']
# NOTE(review): not used in the shown fragment — this is the date that should
# be matched against the cleaned file names below.
input_to_search_in_folder = "2022_-_01_-_05 12:33 am" #file data to find in the 'when' folder
I have changed the `:` to `pp` (short for point-point) because you cannot place `:` in folder and/or file names, at least not on Windows.
2022_-_12_-_29 12pp33 am _--_ 2023_-_01_-_25 19pp13 pm
initial date _--_ final date
In this case input_to_search_in_folder = "2022_-_01_-_05 12:33 am" does not match a file with a specific date name. But if it belongs to the interval of days indicated in the file name '2022_-_12_-_29 12pp33 am _--_ 2023_-_01_-_25 19pp13 pm.txt'
How could I validate that this date "2022_-_01_-_05 12:33 am" does belong to that time interval '2022_-_12_-_29 12pp33 am _--_ 2023_-_01_-_25 19pp13 pm' or if it's this date '2023-02-05 00:00 am'?
If the validation is successful, the program should print the content inside that .txt (in this case inside the 2022_-_12_-_29 12pp33 am _--_ 2023_-_01_-_25 19pp13 pm.txt )
text_file = open("when/" + , "r")
data_inside_this_file = text_file.read()
text_file.close()
#And finally prints the content of the .txt file that matches the date specified in the 'input_to_search_in_folder' variable
print(repr(data_inside_this_file))
I would clean the strings fully and convert them to datetime objects (because these can be compared to each other); then compare them, and you have the result and can do whatever you want with it:
import os
from datetime import datetime

content = os.listdir("when")
print(content)
# e.g. ['2022_-_12_-_29 12pp33 am _--_ 2023_-_01_-_25 19pp13 pm.txt', '2023-02-05 00pp00 am.txt']

for i in range(len(content)):
    content[i] = content[i].replace("_-_", "-").replace("pp", ":")
# e.g. ['2022-12-29 12:33 am _--_ 2023-01-25 19:13 pm.txt', '2023-02-05 00:00 am.txt']


def _parse(stamp):
    """Parse 'YYYY-MM-DD HH:MM am/pm' into a datetime.

    The hour field is already on a 24-hour clock, so the trailing am/pm tag
    is redundant — strip it before parsing. (BUG FIX: the original format
    string '%Y-%m-%d %H:%M' left the ' am'/' pm' suffix in place, so every
    strptime() call raised ValueError.)
    """
    return datetime.strptime(stamp.rsplit(" ", 1)[0], "%Y-%m-%d %H:%M")


# BUG FIX: split the *extension-free* name. The original computed
# cleaned_filename but then split content[0] directly, so the second half
# still carried '.txt' and could not be parsed.
cleaned_filename = os.path.splitext(content[0])[0]  # "2022-12-29 12:33 am _--_ 2023-01-25 19:13 pm"
start_dt = _parse(cleaned_filename.split(" _--_ ")[0])  # datetime(2022, 12, 29, 12, 33)
last_dt = _parse(cleaned_filename.split(" _--_ ")[1])   # datetime(2023, 1, 25, 19, 13)
third_dt = _parse(os.path.splitext(content[1])[0])      # datetime(2023, 2, 5, 0, 0)

input_to_search = "2022_-_01_-_05 12:33 am".replace("_-_", "-")
input_dt = _parse(input_to_search)                      # datetime(2022, 1, 5, 12, 33)

if start_dt <= input_dt <= last_dt:
    print("in between")
elif input_dt == third_dt:
    print("Match")
else:
    print("No!")
A way is to extract the dates using regex and then convert them as date like mrblue6's answer:
#!/usr/bin/python3
from datetime import datetime
import re

# Let's assume this is one of the directory entries
direntry = '2022_-_12_-_29 12pp33 am _--_ 2023_-_01_-_25 19pp13 pm.txt'

# BUG FIX: use raw strings for the regexes — '\d' in a plain string literal
# is an invalid escape sequence (a SyntaxWarning on Python 3.12+, and slated
# to become an error). We exclude the AM/PM part since the clock is 24-hour.
datePattern = r'(\d{4}_-_\d{2}_-_\d{2} \d{2}pp\d{2}) [ap]m'
dirPattern = f'{datePattern} _--_ {datePattern}.txt'

# Extract the "milestone" dates bracketing the interval
matches = re.search(dirPattern, direntry)
extractedDate1 = matches.group(1)
extractedDate2 = matches.group(2)

# Extract the date to check
matches = re.search(datePattern, "2022_-_01_-_05 12pp33 am")
extractedDateToCheck = matches.group(1)

# Convert all three to datetime objects so they can be compared
readDateFormat = '%Y_-_%m_-_%d %Hpp%M'
date1 = datetime.strptime(extractedDate1, readDateFormat)
date2 = datetime.strptime(extractedDate2, readDateFormat)
dateToCheck = datetime.strptime(extractedDateToCheck, readDateFormat)

# Compare them
print(f"Date 1 : {date1}")
print(f"Date 2 : {date2}")
print(f"Date to check: {dateToCheck}")
print(f"Check: {date1 <= dateToCheck <= date2}")
Output:
Date 1 : 2022-12-29 12:33:00
Date 2 : 2023-01-25 19:13:00
Date to check: 2022-01-05 12:33:00
Check: False
I find this a bit hard to explain, however.
I have a very big json object I got as a response from an API. This has about a years worth of data inside of each key. There are 3 keys prices, market_cap, total_volume.
Each key consists of the same; a list with N lists inside which store a UNIX timestamp and a price.
N = 24 per day (in this case my data is a years worth so 365 * 24)
Now I need to convert all of these UNIX timestamps back to readable datetime.
I want to do this for each key as each key has one large list with in there more lists which consists of a unix timestamp and another int.
I want the output to be either another dict or the same dict altered to then later use this data.
Json object
{'prices': [[1581984000000, 9723.605026422496], [1582070400000, 10133.817417084678], [1582156800000, 9618.409248250406], [1582243200000, 9608.16918128246], [1582329600000, 9673.011800270346], [1582416000000, 9658.606058375473], [1582502400000, 9946.716123467522], [1582588800000, 9670.9608064677], [1582675200000, 9345.342701734848], [1582761600000, 8801.657129472504], [1582848000000, 8781.175282202359], [1582934400000, 8717.331431359424], [1583020800000, 8552.989118581636], [1583107200000, 8567.802248679225], [1583193600000, 8905.876104262194], [1583280000000, 8756.718576742549], [1583366400000, 8758.646993191916], [1583452800000, 9038.870323233425], [1583539200000, 9135.84506603321], [1583625600000, 8902.20195010789], [1583712000000, 8041.365538071835], [1583798400000, 7921.33200691072], [1583884800000, 7906.731528510632], [1583971200000, 7935.522040170545], [1584057600000, 5142.990459018316], [1584144000000, 5542.819542373153], [1584230400000, 5214.189112383918], [1584316800000, 5397.93335743919], [1584403200000, 5032.501351487721], [1584489600000, 5389.41577503409], [1584576000000, 5376.280747845438], [1584662400000, 6170.200600237675], [1584748800000, 6195.603505257629], [1584835200000, 6145.770891252399], [1584921600000, 5859.647430299388], [1585008000000, 6456.347485963419], [1585094400000, 6730.173782371188], [1585180800000, 6695.9006183977235], [1585267200000, 6765.56207892991], [1585353600000, 6397.826328325604], [1585440000000, 6255.000398693634], [1585526400000, 5915.337154791903], [1585612800000, 6403.141235565223], [1585699200000, 6421.70541388854], [1585785600000, 6640.797666310131], [1585872000000, 6807.897017890566], [1585958400000, 6732.852018394612], [1586044800000, 6859.424923721944], [1586131200000, 6788.048272605917], [1586217600000, 7297.635558289496], [1586304000000, 7196.782202442051], [1586390400000, 7342.291601148024], [1586476800000, 7294.488875121554], [1586563200000, 6864.694257006497], [1586649600000, 6878.781212589853], [1586736000000, 
6913.158787469097], [1586822400000, 6857.538537511484], [1586908800000, 6860.17853570111], [1586995200000, 6629.431738031291], [1587081600000, 7059.92622475854], [1587168000000, 7035.261503989225], [1587254400000, 7242.5109294929825], [1587340800000, 7127.511949689152], [1587427200000, 6856.456278354705], [1587513600000, 6842.038597634602], [1587600000000, 7109.995291181778], [1587686400000, 7382.793144116689], [1587772800000, 7495.393587498606], [1587859200000, 7538.557687279841], [1587945600000, 7683.867415083342], [1588032000000, 7774.281554448049], [1588118400000, 7758.230255185947], [1588204800000, 8744.430287016561], [1588291200000, 8610.63580374089], [1588377600000, 8824.818413551968], [1588464000000, 8966.307014689282], [1588550400000, 8888.671912686868], [1588636800000, 8884.407813577056], [1588723200000, 9003.240557621584], [1588809600000, 9144.68703972007], [1588896000000, 9959.166416261767], [1588982400000, 9821.81131529702], [1589068800000, 9566.777187205966], [1589155200000, 8752.617087745832], [1589241600000, 8604.75159101983], [1589328000000, 8788.466749414652], [1589414400000, 9283.08601265873], [1589500800000, 9796.494527024528], [1589587200000, 9309.29535940684], [1589673600000, 9375.29710843331], [1589760000000, 9666.32719340344], [1589846400000, 9708.439858793108], [1589932800000, 9760.198937162193], [1590019200000, 9526.50759300584], [1590105600000, 9059.962506871727], [1590192000000, 9131.767275081993], [1590278400000, 9170.361063506127], [1590364800000, 8731.848525870651], [1590451200000, 8883.691769863415], [1590537600000, 8839.130663273247], [1590624000000, 9174.118563996424], [1590710400000, 9546.04563503715], [1590796800000, 9427.120373393418], [1590883200000, 9662.70587254818], [1590969600000, 9466.961781429516], [1591056000000, 10167.93069332851], [1591142400000, 9515.243858655718], [1591228800000, 9645.227869360308], [1591315200000, 9776.20299178848], [1591401600000, 9636.965527050057], [1591488000000, 9662.858709002241], 
[1591574400000, 9738.603356828593], [1591660800000, 9773.02951309516], [1591747200000, 9767.00531665552], [1591833600000, 9874.898681832236], [1591920000000, 9325.996856202635], [1592006400000, 9469.533297509908], [1592092800000, 9469.473456163696], [1592179200000, 9345.960907722063], [1592265600000, 9431.719262201745], [1592352000000, 9524.92661691022], [1592438400000, 9463.361414311787], [1592524800000, 9399.767217129216], [1592611200000, 9312.780104497786], [1592697600000, 9360.247968201687], [1592784000000, 9298.360829121417], [1592870400000, 9678.683208975835], [1592956800000, 9624.684291831398], [1593043200000, 9288.061774486938], [1593129600000, 9258.667161007706], [1593216000000, 9166.486360416233], [1593302400000, 9013.90556467614], [1593388800000, 9139.903276297824], [1593475200000, 9185.166540651147], [1593561600000, 9149.721996758017], [1593648000000, 9230.672998590804], [1593734400000, 9094.318072166905], [1593820800000, 9071.3850427828], [1593907200000, 9132.908369533492], [1593993600000, 9087.407312582163], [1594080000000, 9342.376492626678], [1594166400000, 9253.630980242333], [1594252800000, 9432.172515827939], [1594339200000, 9235.716302064242], [1594425600000, 9282.913638839902], [1594512000000, 9234.314674712627], [1594598400000, 9297.479635872663], [1594684800000, 9240.76251972468], [1594771200000, 9247.060695963813], [1594857600000, 9203.371435179699], [1594944000000, 9136.483376363976], [1595030400000, 9156.276583115488], [1595116800000, 9168.402736564132], [1595203200000, 9202.615839500108], [1595289600000, 9163.159654576915], [1595376000000, 9384.379751903267], [1595462400000, 9514.304987626969], [1595548800000, 9589.81771944117], [1595635200000, 9535.93879573746], [1595721600000, 9691.825138917147], [1595808000000, 9925.751397476346], [1595894400000, 10962.258481207355], [1595980800000, 10904.916526918994], [1596067200000, 11093.612240442404], [1596153600000, 11116.307163685275], [1596240000000, 11325.5515272739], [1596326400000, 
11812.094307268515], [1596412800000, 11066.306240590267], [1596499200000, 11230.907762749297], [1596585600000, 11181.917508034885], [1596672000000, 11719.26352395155], [1596758400000, 11768.127742240009], [1596844800000, 11571.487980683192], [1596931200000, 11739.131006414418], [1597017600000, 11682.851469154939], [1597104000000, 11862.938012702563], [1597190400000, 11398.671060896633], [1597276800000, 11579.867951602135], [1597363200000, 11817.164038803397], [1597449600000, 11777.391322489924], [1597536000000, 11864.905810156475], [1597622400000, 11901.776488302461], [1597708800000, 12272.465808160425], [1597795200000, 11949.610970628193], [1597881600000, 11733.278970862082], [1597968000000, 11861.83657727968], [1598054400000, 11515.124298729217], [1598140800000, 11676.385305081287], [1598227200000, 11647.928120934363], [1598313600000, 11758.828120368864], [1598400000000, 11350.753473213], [1598486400000, 11465.002564032086], [1598572800000, 11300.398363810944], [1598659200000, 11519.118388160729], [1598745600000, 11481.481823317012], [1598832000000, 11701.004008657852], [1598918400000, 11672.324104943627], [1599004800000, 11895.225345345636], [1599091200000, 11418.254756916149], [1599177600000, 10197.459822768922], [1599264000000, 10484.470392265588], [1599350400000, 10177.789718049991], [1599436800000, 10260.0177277544], [1599523200000, 10359.445216989981], [1599609600000, 10125.014956069688], [1599696000000, 10230.154699360752], [1599782400000, 10342.159391205681], [1599868800000, 10378.223044584596], [1599955200000, 10439.38467226404], [1600041600000, 10328.866065987393], [1600128000000, 10661.096235144483], [1600214400000, 10787.58020807624], [1600300800000, 10952.249969107099], [1600387200000, 10937.996396960929], [1600473600000, 10927.150310293275], [1600560000000, 11083.99836119821]]}
had to decrease the size of the object due to stackoverflow not allowing the amount of characters of my post, had to remove 2 keys from the object.
My for loop to just get the lists from the key prices
# `obj` is the API response dict defined elsewhere; each element of
# obj['prices'] is a [unix-millisecond-timestamp, price] pair.
for c in obj['prices']:
print(c)
# [1581905057626, 9860.162323271938]
# [1581908672576, 9853.961230334764]
# [1581912178268, 9803.988325218852]
# etc..
Now how would I go about changing the first value and then storing this all again in a new object with the same structure.
you don't need new object
from datetime import datetime, timezone

obj = {
    'prices': [[1581984000000, 9723.605026422496],
               [1582070400000, 10133.817417084678]]
}

# Rewrite each [timestamp, price] pair in place — no new object needed.
for c in obj['prices']:
    ts = c[0] // 1000  # API timestamps are in milliseconds; convert to seconds
    # datetime.utcfromtimestamp() is deprecated since Python 3.12; an aware
    # UTC datetime formats to the identical string.
    c[0] = datetime.fromtimestamp(ts, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S')

print(obj)
result
{
"prices": [["2020-02-18 00:00:00", 9723.605026422496],
["2020-02-19 00:00:00", 10133.817417084678]]
}
I have a large amount of sport data that I am analyzing. I am trying to find the last location of the team (the time zone).
I have created a dictionary from the data which uses 'date' as the input (date of game) and timezone.
What I am trying to do is create the dictionary, then run code to check each line in data and find the previous game date (more specifically, the time zone at the previous date).
This is my code so far
import os
import re
from datetime import datetime, timedelta

input_folder = '2018files'  # path of folder containing the multiple text files

# create a list with file names
data_files = [os.path.join(input_folder, file) for file in os.listdir(input_folder)]

# Site-id prefixes grouped by time zone. A dict lookup replaces the original
# 50-branch or-chain: O(1) membership and far easier to maintain.
_TZ_BY_PREFIX = {}
for _tz, _prefixes in {
    "GMT-4": {"ALB", "ATL", "BAL", "BOS", "BUF", "CAN", "CIN", "CLE", "CLL",
              "COL", "COV", "DAY", "DET", "DOV", "FOR", "FTB", "GEA", "GLO",
              "GRA", "HAR", "HRT", "IND", "IRO", "JER", "LBV", "LOU", "LUD",
              "MAS", "MIA", "MID", "MON", "NYC", "PHI", "PIT", "PRO", "RIC",
              "ROC", "SAI", "SJU", "SPR", "STP", "SYR", "THR", "TOL", "TOR",
              "WAR", "WAT", "WAV", "WEE", "WIL", "WNY", "WOR"},
    "GMT-5": {"ARL", "CHI", "HOU", "KAN", "KEO", "MIL", "MIN", "MNT", "RCK", "STL"},
    "GMT-6": {"DEN", "PHO", "WHE"},
    "GMT-7": {"ANA", "LAS", "LOS", "OAK", "SAN", "SEA", "SFA", "SFO", "WAS"},
}.items():
    for _p in _prefixes:
        _TZ_BY_PREFIX[_p] = _tz

# open csv file for writing (renamed from `csv` so the variable no longer
# shadows the stdlib module of the same name)
out_csv = open('myoutput2.csv', 'w')


def write_to_csv(line):
    """Echo a CSV line to stdout and append it to the output file."""
    print(line)
    out_csv.write(line)


# BUG FIX: the schedule dictionary must be built once, *outside* the per-game
# loop. The original recreated it for every game, so it only ever held the
# current game and the previous-game search never found anything — the
# unbounded day-by-day walk then ran past year 1 and raised OverflowError.
schedule_ANA = dict()  # 'YYYY/MM/DD' -> time zone of every ANA game seen

for file in data_files:
    with open(file, 'r') as f:  # context manager closes the file for us
        # BUG FIX: the original did `for line in f:` and then called
        # f.readlines() inside that loop, silently dropping the first line.
        lines = f.readlines()
    i = 0
    while i < len(lines):
        temp_array = lines[i].rstrip().split(",")
        if temp_array[0] == "id":
            game_id = temp_array[1]
            awayteam = lines[i + 2].rstrip().split(",")[2]
            hometeam = lines[i + 3].rstrip().split(",")[2]
            site = lines[i + 4].rstrip().split(",")[2]
            date = lines[i + 5].rstrip().split(",")[2]
            # 0 = unknown site prefix, matching the original's default value
            timezone = _TZ_BY_PREFIX.get(site[:3], 0)
            # ANA is the team we are interested in
            if hometeam == "ANA" or awayteam == "ANA":
                schedule_ANA[date] = str(timezone)
                print(schedule_ANA)
                # Walk back one day at a time until a previous game is found.
                # BUG FIX: the original looked up the undefined name
                # `schedule` (NameError once reached) and had no lower bound;
                # stop at the earliest date recorded so far.
                earliest = min(datetime.strptime(d, '%Y/%m/%d') for d in schedule_ANA)
                previous_day_object = datetime.strptime(date, '%Y/%m/%d')
                while True:
                    previous_day_object -= timedelta(days=1)
                    if previous_day_object < earliest:
                        break  # no earlier ANA game on record
                    if previous_day_object.strftime('%Y/%m/%d') in schedule_ANA:
                        print("Date of Previous Game", previous_day_object)
                        break
                # write this game's summary row to the csv
                output_for_csv2 = (game_id, date, hometeam, awayteam,
                                   str(site), str(timezone))
                out_csv.write(','.join(output_for_csv2) + '\n')
        i = i + 1

out_csv.close()
I am getting this error, even though there are dates before 2018/04/03.
{'2018/04/03': 'GMT-7'}
Traceback (most recent call last):
File "H:/2019sem2/egh400/Code/rev41_test.py", line 73, in <module>
previous_day_object = previous_day_object - timedelta(days = 1)
OverflowError: date value out of range
I think the issue is that I am potentially creating the dictionary as the lines of data are being read, so there is no previous date to compare it to.
Any recommendations on how to fix it?
.
.
.
.
Also, here is some data incase it's useful.
id,ANA201804020
version,2
info,visteam,CLE
info,hometeam,ANA
info,site,ANA01
info,date,2018/04/03
id,ANA201804030
version,2
info,visteam,CLE
info,hometeam,ANA
info,site,ANA01
info,date,2018/04/02
info,number,0
info,starttime,7:08PM
info,daynight,night
info,usedh,true
play,3,1,river003,32,*B*BCS11*B>X,8/F
data,er,parkb001,0
data,er,woodb004,0
id,ANA201804040
version,2
info,visteam,CLE
info,hometeam,ANA
info,site,ANA01
info,date,2018/04/04
info,number,0
info,starttime,1:08PM
info,daynight,day
info,usedh,true
info,umphome,dimum901
data,er,bedrc001,0
data,er,middk001,0
data,er,woodb004,0
data,er,parkb001,0
data,er,bardl001,0
data,er,ramin002,0
id,ANA201804060
version,2
info,visteam,OAK
info,hometeam,ANA
info,site,ANA01
info,date,2018/04/06
info,number,0
info,starttime,7:07PM
info,daynight,night
info,usedh,true
info,umphome,knigb901
start,ramij002,"J.C. Ramirez",1,0,1
play,1,0,joycm001,32,CBBFBFB,W
data,er,ramin002,1
id,ANA201804080
version,2
info,visteam,OAK
info,hometeam,ANA
info,site,ANA01
info,date,2018/04/08
info,number,0
info,starttime,1:08PM
info,daynight,day
data,er,woodb004,0
data,er,penaf002,1
id,ANA201804170
version,2
info,visteam,BOS
info,hometeam,ANA
info,site,ANA01
info,date,2018/04/17
info,number,0
info,starttime,7:09PM
info,daynight,night
info,usedh,true
info,umphome,carav901
info,ump1b,bakej902
data,er,bedrc001,1
data,er,johnj010,0
id,ANA201804180
version,2
info,visteam,BOS
info,hometeam,ANA
info,site,ANA01
info,date,2018/04/18
info,number,0
info,starttime,7:08PM
info,daynight,night
info,usedh,true
info,umphome,bakej902
data,er,woodb004,1
data,er,bedrc001,0
data,er,middk001,2
id,ANA201804190
version,2
info,visteam,BOS
info,hometeam,ANA
info,site,ANA01
info,date,2018/04/19
info,number,0
info,starttime,7:08PM
info,daynight,night
info,usedh,true
info,umphome,laynj901
data,er,bardl001,0
id,ANA201804200
version,2
info,visteam,SFN
info,hometeam,ANA
info,site,ANA01
info,date,2018/04/20
info,number,0
info,starttime,7:08PM
info,daynight,night
info,usedh,true
info,umphome,rippm901
info,ump1b,westj901
Your problem has nothing to do with the previous date. Here's an MCVE:
# Minimal reproduction of the OverflowError — deliberately broken: nothing
# ever terminates this loop, so it keeps subtracting one day until the date
# falls below datetime.min (0001-01-01) and the subtraction raises.
from datetime import datetime, timedelta
date = '2018/04/03'
previous_day_object = datetime.strptime(date, '%Y/%m/%d')
while True:
previous_day_object = previous_day_object - timedelta(days = 1)
You see that your loop never ends, or more precisely: ends with an error. You can remove one day, and one day, and ..., but when the date is finally the 1st of January of year 1 (0001-01-01), you can't remove that day anymore:
>>> import datetime
>>> d = datetime.date.min
>>> d.isoformat()
'0001-01-01'
>>> d - datetime.timedelta(days = 1)
Traceback (most recent call last):
...
OverflowError: date value out of range
So I have several log files, they are structured like this:
Sep 9 12:42:15 apollo sshd[25203]: pam_unix(sshd:auth): authentication failure; logname= uid=0 euid=0 tty=ssh ruser= rhost=189.26.255.11
Sep 9 12:42:15 apollo sshd[25203]: pam_succeed_if(sshd:auth): error retrieving information about user ftpuser
Sep 9 12:42:17 apollo sshd[25203]: Failed password for invalid user ftpuser from 189.26.255.11 port 44061 ssh2
Sep 9 12:42:17 apollo sshd[25204]: Received disconnect from 189.26.255.11: 11: Bye Bye
Sep 9 19:12:46 apollo sshd[30349]: Did not receive identification string from 199.19.112.130
Sep 10 03:29:48 apollo unix_chkpwd[4549]: password check failed for user (root)
Sep 10 03:29:48 apollo sshd[4546]: pam_unix(sshd:auth): authentication failure; logname= uid=0 euid=0 tty=ssh ruser= rhost=221.12.29.170 user=root
Sep 10 03:29:51 apollo sshd[4546]: Failed password for root from 221.12.29.170 port 56907 ssh2
There are more dates and times, But this is an example. I was wondering how I would calculate the total time that the file covers. I've tried a few things, and have had about 5 hours of no success.
I tried this first, and it was close, but it didn't work like I wanted it to, it kept repeating dates:
# NOTE(review): fragment quoted from the question. `filename`, `date1`,
# `time1`, `ltime1`, `dates`, `ttimes` and `FMT` are defined elsewhere (not
# shown), and the bare `print` statement makes this Python 2 syntax.
with open(filename, 'r') as file1:
lines = file1.readlines()
for line in lines:
linelist = line.split()
date2 = int(linelist[1])
time2 = linelist[2]
print linelist[0], linelist[1], linelist[2]
# First line seen: remember the day and record the "Month day" label.
if date1 == 0:
date1 = date2
dates.append(linelist[0] + ' ' + str(linelist[1]))
# Day rolled over: close out the previous day's span and reset the per-day
# min/max trackers — presumably to sum per-day coverage; verify intent.
if date1 < date2:
date1 = date2
ttimes.append(datetime.strptime(str(ltime1), FMT) - datetime.strptime(str(time1), FMT))
time1 = '23:59:59'
ltime1 = '00:00:00'
dates.append(linelist[0] + ' ' + str(linelist[1]))
# Track the earliest (time1) and latest (ltime1) timestamps of the day.
if time2 < time1:
time1 = time2
if time2 > ltime1:
ltime1 = time2
If the entries are in a chronological order, you can just look at the first and at the last entry:
# Assumes `lines` holds the whole log as one string and entries are already
# chronological; the date is whatever precedes the host name "apollo".
entries = lines.split("\n")
first_date = entries[0].split("apollo")[0]
# NOTE(review): if the log text ends with a newline, the final entry is an
# empty string — confirm, or use entries[-1] after filtering blanks.
last_date = entries[len(entries)-1].split("apollo")[0]
We don't have the year, so I took the current year. Read all the lines, convert the month to month index, and parse each date.
Then sort it (so works even if logs mixed) and take first & last item. Substract. Enjoy.
from datetime import datetime

# Month abbreviations; months.index("Sep") -> 9. The leading "" pads index 0
# so list positions line up with calendar month numbers.
months = ["", "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
current_year = datetime.now().year  # syslog lines carry no year, so assume the current one

dates = list()
with open(filename, 'r') as file1:
    for line in file1:
        linelist = line.split()
        if linelist:  # filter out possible empty lines
            # Replace the 3-letter month with its numeric index, then parse
            # "month day HH:MM:SS year" in one strptime call.
            # (Removed the original's unused `date2 = int(linelist[1])`.)
            linelist[0] = str(months.index(linelist[0]))
            stamp = " ".join(linelist[0:3]) + " " + str(current_year)
            dates.append(datetime.strptime(stamp, "%m %d %H:%M:%S %Y"))

dates.sort()  # sort so the code works even if log entries are interleaved
first_date = dates[0]
last_date = dates[-1]
# print report & compute time span
print("start {}, end {}, time span {}".format(first_date, last_date, last_date - first_date))
result:
start 2016-09-09 12:42:15, end 2016-09-10 03:29:51, time span 14:47:36
Note that it won't work properly between december 31st and january the 1st because of the missing year info. I suppose we could make a guess if we find January & December in the log then assume that it's january from the next year. Unsupported yet.