How to extract the file Data in python - python

I want to extract a value (a date or a number) from the given string on the basis of its tag.
My string is -
DATE: 7/25/2017 DATE OPENED: 7/25/2017 RETURN DATE: 7/26/2017
NUMBER: 201707250008754 RATE: 10.00
I want something like this -
If I give "DATE" it should return 7/25/2017 only
if I give "RETURN DATE" it should return 7/26/2017
if I give the "NUMBER" it should return 201707250008754
and so on.
How can we achieve this in Python 2.7? (Note: the dates and numbers in the string are different every time.)

You can create a dictionary from the string's contents with re:
import re

s = 'DATE: 7/25/2017 DATE OPENED: 7/25/2017 RETURN DATE: 7/26/2017 NUMBER: 201707250008754 RATE: 10.00'

# Tokenise the string into alternating tags (letters/spaces directly before a
# colon) and values (digits, slashes, dots). The pattern is a raw string so
# that \s and \d reach the regex engine instead of being treated as
# (invalid) string escapes.
results = re.findall(r'[a-zA-Z\s]+(?=:)|[\d/.]+', s)

# Pair every tag with the token that follows it. zip() stops at the shorter
# slice, so a trailing tag without a value cannot raise IndexError.
d = {tag.strip(): value for tag, value in zip(results[::2], results[1::2])}

for i in ['DATE', 'RETURN DATE', 'NUMBER']:
    print(d[i])
Output:
7/25/2017
7/26/2017
201707250008754

Use dict to map key (eg: 'DATE' ) to its value.
import re

s = '''DATE: 7/25/2017 DATE OPENED: 7/25/2017 RETURN DATE: 7/26/2017 NUMBER: 201707250008754 RATE: 10.00'''

# Each match is a (tag, value) pair: the shortest text up to a colon, then the
# run of digits, slashes and dots that follows it. Raw string: \s must reach
# the regex engine, and ':' needs no escaping.
items = re.findall(r'\s*(.*?):\s*([0-9/.]*)', s)
#[('DATE', '7/25/2017'), ('DATE OPENED', '7/25/2017'), ('RETURN DATE', '7/26/2017'), ('NUMBER', '201707250008754'), ('RATE', '10.00')]

info = dict(items)
#{'DATE': '7/25/2017', 'DATE OPENED': '7/25/2017', 'RETURN DATE': '7/26/2017', 'NUMBER': '201707250008754', 'RATE': '10.00'}

for key in ['DATE', 'RETURN DATE', 'NUMBER']:
    print(info[key])

Related

KeyError when trying to access the mode of DataFrame columns

I am trying to run the following code:
import time
import pandas as pd
import numpy as np
# Lookup table: supported city name -> CSV file holding that city's trips.
CITY_DATA = dict(zip(
    ('chicago', 'new york city', 'washington'),
    ('chicago.csv', 'new_york_city.csv', 'washington.csv'),
))
def get_filters():
    """
    Asks user to specify a city, month, and day to analyze.

    Every answer is stripped of surrounding whitespace and lower-cased before
    it is validated, and the question is repeated until a listed choice is
    entered.

    Returns:
        (str) city - name of the city to analyze
        (str) month - name of the month to filter by, or "all" to apply no month filter
        (str) day - name of the day of week to filter by, or "all" to apply no day filter
    """
    def ask(prompt, choices):
        # One validation loop shared by all three questions.
        while True:
            answer = input(prompt).strip().lower()
            if answer in choices:
                return answer
            print(" You entered wrong choice , please try again")

    print('Hello! Let\'s explore some US bikeshare data!')

    # get user input for city (chicago, new york city, washington)
    city = ask(
        'Which city you would like to explore : "chicago" , "new york city" , or "washington" :',
        ('chicago', 'new york city', 'washington'),
    )

    # get user input for month (all, january, february, ... , june)
    month = ask(
        'Enter "all" for all data or chose a month : "january" , "february" , "march", "april" , "may" or "june " :',
        ("all", "january", "february", "march", "april", "may", "june"),
    )

    # get user input for day of week (all, monday, tuesday, ... sunday)
    day = ask(
        'Enter "all" for all days or chose a day : "saturday", "sunday", "monday", "tuesday", "wednesday", "thursday", "friday": ',
        ("all", "saturday", "sunday", "monday", "tuesday", "wednesday", "thursday", "friday"),
    )

    print('-'*60)
    return city, month, day
def load_data(city, month, day):
    """
    Loads data for the specified city and filters by month and day if applicable.

    Args:
        (str) city - name of the city to analyze
        (str) month - name of the month to filter by, or "all" to apply no month filter
        (str) day - name of the day of week to filter by, or "all" to apply no day filter
    Returns:
        df - Pandas DataFrame containing city data filtered by month and day
    """
    df = pd.read_csv(CITY_DATA[city])

    # convert the Start Time column to datetime
    df['Start Time'] = pd.to_datetime(df['Start Time'])

    # extract month , day of week , and hour from Start Time to new columns
    df['month'] = df['Start Time'].dt.month
    # day_name is a method, not a property: without the call parentheses the
    # column is filled with the bound method object, the day filter below
    # never matches, and every later statistic runs on an empty frame.
    df['day_of_week'] = df['Start Time'].dt.day_name()
    df['hour'] = df['Start Time'].dt.hour

    # filter by month if applicable
    if month != 'all':
        # use the index of the month_list to get the corresponding int
        months = ['january', 'february', 'march', 'april', 'may', 'june']
        month = months.index(month) + 1
        # filter by month to create the new dataframe
        df = df[df['month'] == month]

    # filter by day of week if applicable
    if day != 'all':
        # day names in the column are capitalised ("Monday"), so title-case
        # the lower-case user choice before comparing
        df = df[df['day_of_week'] == day.title()]

    return df
def time_stats(df):
    """Displays statistics on the most frequent times of travel.

    Reads the 'month', 'day_of_week' and 'hour' columns of df and prints the
    most common value of each. When a column has no data (for example after
    the filters removed every row) "unknown" is printed instead of raising.
    """
    def most_common(column):
        # Series.mode() returns an empty Series when there is no data;
        # indexing that with [0] is what raises "KeyError: 0".
        modes = df[column].mode()
        return modes.iloc[0] if not modes.empty else 'unknown'

    print('\nCalculating The Most Frequent Times of Travel...\n')
    start_time = time.time()

    # display the most common month
    popular_month = most_common('month')
    print('\n The most popular month is : \n', popular_month)

    # display the most common day of week
    popular_day = most_common('day_of_week')
    print('\n The most popular day of the week is : \n', str(popular_day))

    # display the most common start hour
    popular_hour = most_common('hour')
    print('\n The most popular hour of the day is :\n ', popular_hour)

    print("\nThis took %s seconds.\n" % (time.time() - start_time))
    print('-'*60)
def station_stats(df):
    """Displays statistics on the most popular stations and trip.

    Reads the 'Start Station' and 'End Station' columns of df and prints the
    most used start station, the most used end station and the most frequent
    (start, end) pair. Prints a short notice instead when df has no rows.
    """
    print('\nCalculating The Most Popular Stations and Trip...\n')
    start_time = time.time()

    if df.empty:
        # idxmax() raises on an empty Series, so there is nothing to report
        print('\nNo trips match the selected filters.')
    else:
        # display most commonly used start station
        start_station = df['Start Station'].value_counts().idxmax()
        print('\n The most commonly used start station is : \n', start_station)

        # display most commonly used end station
        end_station = df['End Station'].value_counts().idxmax()
        print('\nThe most commonly used end station is: \n', end_station)

        # display most frequent combination of start station and end station trip
        # size() counts the rows of each (start, end) group, so idxmax() is
        # exactly that pair; value_counts() on the grouped frame counted
        # whole rows and returned a tuple of every column.
        combination = df.groupby(['Start Station', 'End Station']).size().idxmax()
        print('\nThe most frequent combination of start station and end station are: \n', combination)

    print("\nThis took %s seconds." % (time.time() - start_time))
    print('-'*40)
def trip_duration_stats(df):
    """Displays statistics on the total and average trip duration.

    Reads the 'Trip Duration' column of df. The total is printed both in
    days (total / 86400, i.e. the values are treated as seconds) and in
    seconds, followed by the mean.
    """
    start_time = time.time()

    # display total travel time (computed once, reused for both lines)
    total_time = df['Trip Duration'].sum()
    print('Total travel time:', total_time / 86400, " Days")
    # The placeholder has to be filled with format(); passing the value as a
    # second print() argument left a literal "{}" in the output.
    print('\nThe total travel time is {} seconds'.format(total_time))

    # display mean travel time
    mean_time = df['Trip Duration'].mean()
    print('\n The average travel time is \n', mean_time)

    print("\nThis took %s seconds." % (time.time() - start_time))
    print('-'*40)
def user_stats(df):
    """Displays statistics on bikeshare users.

    Prints how many rows of df fall under each value of the 'User Type'
    column.
    """
    print('\nCalculating User Stats...\n')
    start_time = time.time()

    # Display counts of user types
    user_types = df['User Type'].value_counts()
    print('User Types:\n', user_types)

    # TO DO: Display counts of gender

    print("\nThis took %s seconds." % (time.time() - start_time))
    print('-'*40)
def main():
    """Run the interactive session.

    Each pass asks for the filters, loads the matching data and prints the
    four statistics sections; the loop ends as soon as the user answers
    anything other than "yes" to the restart question.
    """
    while True:
        city, month, day = get_filters()
        df = load_data(city, month, day)

        time_stats(df)
        station_stats(df)
        trip_duration_stats(df)
        user_stats(df)

        restart = input('\nWould you like to restart? Enter yes or no.\n')
        # tolerate stray whitespace and capitals around the answer
        if restart.strip().lower() != 'yes':
            break


if __name__ == "__main__":
    main()
and I am receiving the following errors , can someone assist please
the errors:
> Traceback (most recent call last):
File "C:\Users\DELL\PycharmProjects\Professional\venv\Lib\site-packages\pandas\core\indexes\range.py", line 391, in get_loc
return self._range.index(new_key)
^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: 0 is not in range
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "C:\Users\DELL\PycharmProjects\Professional\Bikeshare.py", line 203, in <module>
main()
File "C:\Users\DELL\PycharmProjects\Professional\Bikeshare.py", line 192, in main
time_stats(df)
File "C:\Users\DELL\PycharmProjects\Professional\Bikeshare.py", line 100, in time_stats
popular_month = df['month'].mode()[0]
~~~~~~~~~~~~~~~~~~^^^
File "C:\Users\DELL\PycharmProjects\Professional\venv\Lib\site-packages\pandas\core\series.py", line 981, in __getitem__
Calculating The Most Frequent Times of Travel...
return self._get_value(key)
^^^^^^^^^^^^^^^^^^^^
File "C:\Users\DELL\PycharmProjects\Professional\venv\Lib\site-packages\pandas\core\series.py", line 1089, in _get_value
loc = self.index.get_loc(label)
^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\DELL\PycharmProjects\Professional\venv\Lib\site-packages\pandas\core\indexes\range.py", line 393, in get_loc
raise KeyError(key) from err
KeyError: 0
I am expecting to filter pandas DataFrame to return month, day of week, and hour to perform some statistics.
KeyError means that the key isn't valid, because it doesn't exist. In this case, one reason to get KeyError when trying to get first mode is when column 'month' in dataframe is empty, and therefore mode() returns an empty collection, so you get KeyError: 0 when trying to get its first element.
To avoid this, you could replace:
popular_month = df['month'].mode()[0]
With:
try:
    # try to get first mode of column 'month'
    popular_month = df['month'].mode()[0]
except KeyError:
    # if there's no data on column 'month'
    popular_month = "unknown"
Because if there's no data on 'month' column, there's no point in trying to get its mode.
More about handling exceptions: https://docs.python.org/3/tutorial/errors.html#handling-exceptions
Also when I tried to ( not use the filters) by choosing " all " in the second and 3rd input, I get the following result:
Calculating The Most Frequent Times of Travel...
The most popular month is :
6
The most popular day of the week is :
<bound method PandasDelegate._add_delegate_accessors.._create_delegator_method..f of <pandas.core.indexes.accessors.DatetimeProperties object at 0x0000022B7CD5E890>>
The most popular hour of the day is :
17
This took 0.0260775089263916 seconds.
Calculating The Most Popular Stations and Trip...
The most commonly used start station is :
Streeter Dr & Grand Ave
The most commonly used end station is:
Streeter Dr & Grand Ave
The most frequent combination of start station and end station are:
('2112 W Peterson Ave', '2112 W Peterson Ave', 1064651, Timestamp('2017-06-02 07:59:13'), '2017-06-02 08:25:42', 1589, 'Subscriber', 'Female', 1963.0, 6, <bound method PandasDelegate._add_delegate_accessors.._create_delegator_method..f of <pandas.core.indexes.accessors.DatetimeProperties object at 0x0000022B7CD5E890>>, 7)
This took 2.1254045963287354 seconds.
Total travel time: 3250.8308680555556 Days
The total travel time is {} seconds:
280871787
The average travel time is
936.23929
This took 0.06502270698547363 seconds.
Calculating User Stats...
User Types:
Subscriber 238889
Customer 61110
Dependent 1
Name: User Type, dtype: int64
This took 0.022009611129760742 seconds.
Would you like to restart? Enter yes or no.

Searching for a specific value within a list of dictionaries

I need to be able to print all instances of a name within the list of dictionaries. I can't seem to be able to print them in the desired format. It also doesn't work when it's in lowercase and the name is in uppercase.
def findContactsByName(name):
    """Return every dict in `contacts` whose 'name' equals *name*, ignoring case."""
    wanted = name.lower()
    return [element for element in contacts if element['name'].lower() == wanted]
def displayContactsByName(name):
    """Print every contact found for *name*, one 'field : value' line per entry.

    Values are converted to text and stripped of surrounding whitespace; a
    blank line separates consecutive contacts.
    """
    for contact in findContactsByName(name):
        for field, value in contact.items():
            print('{} : {}'.format(field, str(value).strip()))
        print()
if inp == 3:
print("Item 3 was selected: Find contact")
name = input("Enter name of contact to find: ")
displayContactsByName(name)
When the name 'Joe' was put in the output is:
[{'name': 'Joe', 'surname': ' Miceli', 'DOB': ' 25/06/2002', 'mobileNo': ' 79444425', 'locality': ' Zabbar'}, {'name': 'Joe', 'surname': 'Bruh', 'DOB': '12/12/2131', 'mobileNo': '77777777', 'locality': 'gozo'}]
When the name 'joe':
[]
Expected output:
name : Joe
surname : Miceli
DOB : 25/06/2002
mobileNo : 79444425
locality : Zabbar
name : Joe
surname : Bruh
DOB : 12/12/2131
mobileNo : 77777777
locality : gozo
Change the first function to:
def findContactsByName(name):
    """Return every dict in `contacts` whose 'name' equals *name*, ignoring case."""
    # lower-case the search term once instead of once per contact
    wanted = name.lower()
    return [element for element in contacts if element['name'].lower() == wanted]
To account for the differences in uppercase and lowercase, I've just converted the name in the dictionary and the entered name to lowercase during the comparison part alone.
To be able to print it in the format that you've specified you could make a function for the same as follows:
def printResult(result):
    """Print each contact dict in *result* as 'field: value' lines.

    The fields are printed in a fixed order (name, surname, DOB, mobileNo,
    locality) and every contact is followed by one blank line.
    """
    for d in result:
        for field in ('name', 'surname', 'DOB', 'mobileNo', 'locality'):
            print(f"{field}: {d[field]}")
        print()
result=findContactsByName("joe")
printResult(result)
I modified your program. Now you don't have to worry about the case and the output formatting.
contacts = [
    {'name': 'Joe', 'surname': ' Miceli', 'DOB': ' 25/06/2002', 'mobileNo': ' 79444425', 'locality': ' Zabbar'},
    {'name': 'Joe', 'surname': 'Bruh', 'DOB': '12/12/2131', 'mobileNo': '77777777', 'locality': 'gozo'},
]


def findContactsByName(name):
    """Return every dict in `contacts` whose 'name' equals *name*, ignoring case."""
    wanted = name.lower()
    return [element for element in contacts if element['name'].lower() == wanted]


def displayContactsByName(name):
    """Print each contact found for *name* as 'field: value' lines."""
    # Iterate over the matches themselves. Indexing `contacts` with the
    # position inside the result list prints the first N contacts, which are
    # the wrong records whenever the matches are not at the front.
    for contact in findContactsByName(name):
        for field, value in contact.items():
            print('{}: {}'.format(field, value))
        print('\n')


displayContactsByName('Joe')
Case issue can be solved by setting each side of the comparison to UPPERCASE or LOWERCASE.
return [element for element in contacts if element['name'].upper() == name.upper()]
For the format of the print statement you could use the json module:
import json
print(json.dumps( findContactsByName(name), sort_keys=True, indent=4))

validation-remove currency symbol from price

I have a string that holds a price. The price can come with any of the currency symbols in currency_list, and I am trying to remove the symbol and return only the number.
So far I can do this when the symbol is a prefix or a suffix, using the code below, and everything works up to that point.
I just want to add one validation: if the symbol is neither a prefix nor a suffix but sits in the middle, like "200$434", the function should report that the format is not valid. I do not understand how to implement that.
currency_list = ['USD', 'UNITED STATES DOLLAR', '$', 'EUR', 'EURO', '€', 'GBP','BRITISH POUND', '£']
Normally input string can be
"$1212212"
"1212212EURO"
"1212212"
"1212212 BRITISH POUND"
need help to validate values like "1212$343" or "1212212EURO323.23"
Code:
for symb in currency_list:
if symb in amount:
data = amount.replace(symb, '')
After going through multiple blog post, I found this answer which gets the job done.
def validateCurrency(amount, currencies=None):
    """Strip one leading or trailing currency symbol from *amount*.

    Args:
        amount (str): text such as "$1212212", "1212212EURO" or
            "1212212 BRITISH POUND". A plain number is accepted as well.
        currencies (list[str] | None): symbols to recognise; defaults to the
            module-level `currency_list`.

    Returns:
        str: "Price after removeing symbol is <number>" when what is left
        after removing the symbol is a plain integer or decimal number,
        otherwise "Currency is not valid a string." (for example when a
        symbol sits in the middle, as in "1212$343").
    """
    import re  # local import keeps the snippet self-contained

    symbols = currency_list if currencies is None else currencies
    text = amount.strip()

    # Candidate remainders: the text itself, plus the text with one symbol
    # cut from the start or from the end. Slicing removes the symbol where it
    # was found; replace(cur, "", 1) removed the FIRST occurrence even when
    # the match was at the end.
    candidates = [text]
    for cur in symbols:
        if not cur:
            continue
        if text.startswith(cur):
            candidates.append(text[len(cur):])
        if text.endswith(cur):
            candidates.append(text[:-len(cur)])

    for candidate in candidates:
        candidate = candidate.strip()
        # The remainder must be a number, so a symbol left in the middle
        # makes the whole value invalid.
        if re.fullmatch(r'\d+(?:\.\d+)?', candidate):
            return f"Price after removeing symbol is {candidate}"
    return "Currency is not valid a string."


# print(validateCurrency('$1212212'))
You can use regex to achieve your purpose.
import re

currency_list = ['USD', 'UNITED STATES DOLLAR', '$', 'EUR', 'EURO', '€', 'GBP', 'BRITISH POUND', '£']

# prefix (non-digits) / amount (integer or decimal, one digit is enough) /
# suffix (whatever follows the amount)
p = re.compile(r'(\D*?)\s*(\d+(?:\.\d+)?)\s*(.*)')


def verify_or_get_amount(amount):
    """Validate *amount* and return its numeric part.

    The text is split into prefix, number and suffix. It is valid when the
    prefix and the suffix are each either empty or an entry of
    `currency_list`.

    Returns:
        str | None: the number as text for valid input; None for invalid
        input (including text that contains no digits at all). The verdict
        is also printed.
    """
    match = p.fullmatch(amount.strip())
    if match is None:
        # no digits anywhere: nothing to extract
        print('invalid:', amount)
        return None

    first, mid, last = (part.strip() for part in match.groups())
    if (first and first not in currency_list) or (last and last not in currency_list):
        print('invalid:', amount)
        return None

    print('amount:', mid)
    return mid


for i in ['EURO123', 'EURO 123', 'EURO 123.', 'EURO .12', 'EURO 12.12', '$1212212', '1212212EURO', '1212212', '1212212 BRITISH POUND', '1212$343']:
    verify_or_get_amount(i)
using regex:
import re

# '$' is a regex metacharacter, so it is stored pre-escaped; the raw string
# keeps the backslash from being read as an (invalid) string escape.
currency_list = ['USD', 'UNITED STATES DOLLAR', r'\$', 'EUR', 'EURO', '€', 'GBP', 'BRITISH POUND', '£']
currencies = '|'.join(currency_list)

# optional currency, optional spaces, the price, optional spaces, optional
# currency - anchored at both ends so a symbol in the middle cannot match
c = re.compile(rf'^({currencies})? *(\d+(\.\d+)?) *({currencies})?$')

for i in ['$1212212', '1212212EURO', '1212212', '1212212 BRITISH POUND', '1212$343']:
    match_obj = c.match(i)
    if match_obj:
        print(match_obj.group(2))
    else:
        print('not found')
output :
1212212
1212212
1212212
1212212
not found
Explanation :
to see actual pattern : print(c.pattern) which gives :
^(USD|UNITED STATES DOLLAR|\$|EUR|EURO|€|GBP|BRITISH POUND|£)? *(\d+(\.\d+)?) *(USD|UNITED STATES DOLLAR|\$|EUR|EURO|€|GBP|BRITISH POUND|£)?$
I've escaped $ in the currency_list.
currencies = '|'.join(currency_list) for building possible prefixes or suffixes.
(\d+(\.\d+)?) is for matching price which accept float as well. (you can omit the (\.\d+) part)
The " *" in the regex allows optional spaces between the number and the currency, as in "1212212 BRITISH POUND", where a space follows the number.
I am assuming you want a currency validation function
def validateCurrency(input):
    """Return True when *input* is exactly one currency symbol plus one run of digits.

    The first "symbol then digits" or "digits then symbol" pair found in the
    text is taken; the text is valid when the symbol is an entry of
    `currency_list` and the pair accounts for every character. A text made
    of digits only has no currency and yields False.
    """
    if input.isdigit():
        return False

    matches = re.findall(r'(\D+?)(\d+)|(\d+?)(\D+)', input)
    if not matches:
        # e.g. "" or "$$": no symbol/number pair at all
        return False

    # Add up the lengths of the recognised pieces of the first pair; the
    # groups of the alternative that did not match are empty strings and
    # contribute nothing.
    total_length = 0
    for part in matches[0]:
        if part in currency_list or part.isdigit():
            total_length += len(part)

    return total_length == len(input)

Convert string to date using datefinder

An issue occurs when I try to find a date in a .txt file using datefinder. I have the feeling I am unnecessarily switching between data types to obtain the result I desire.
Underneath is a MWE which results in generator object, which in turn is empty when changed to a list. I would like to obtain a datetime in the format %d-%m-%Y.
MWE:
import datefinder

f = ['this is text', 'this is a date', '* Model creation date: Sun Apr 25 08:52:06 2021']
for line in f:
    if "creation date" in line:
        date_line = str(line)
        # find_dates() returns a generator; pass the variable defined on the
        # line above (the original referenced an undefined `_date_line`)
        rev_date = datefinder.find_dates(date_line)
dateutil's parser seems to do a better job:
# Import the submodule explicitly: a bare `import dateutil` does not
# guarantee that `dateutil.parser` is available as an attribute.
import dateutil.parser

f = ['this is text', 'this is a date', '* Model creation date: Sun Apr 25 08:52:06 2021']
dates = []
for line in f:
    try:
        # fuzzy=True lets the parser skip the words around the date
        dates.append(dateutil.parser.parse(line, fuzzy=True))
    except dateutil.parser.ParserError:
        # this line holds no date - skip it on purpose
        continue
print(dates)
# [datetime.datetime(2021, 4, 25, 8, 52, 6)]
For the specific use-case:
rev_date = None  # stays None when no line carries the marker
for line in f:
    if "* Model creation date:" in line:
        rev_date = dateutil.parser.parse(line, fuzzy=True)
        break
print(rev_date)
# 2021-04-25 08:52:06
It seems that datefinder.find_dates is thrown off by the ":" character. If you remove the ":" after "creation date", you get the right result.
If your line always contains "creation date:", you can remove that substring inside the if statement:
import datefinder

f = ['this is text', 'this is a date', '* Model creation date: Sun Apr 25 08:52:06 2021']
for line in f:
    if "creation date" in line:
        # Drop the label together with its colon before parsing. The search
        # text must be spelled exactly like the label, otherwise replace()
        # returns the line unchanged.
        date_line = line.replace('creation date:', '')
        # find_dates() returns a generator; wrap it in list() to see the dates
        rev_date = datefinder.find_dates(date_line)

how to use the input with pandas to get all the value.count linked to this input

my dataframe looks like this:
Index(['#Organism/Name', 'TaxID', 'BioProject Accession', 'BioProject ID', 'Group', 'SubGroup', 'Size (Mb)', 'GC%', 'Replicons', 'WGS',
'Scaffolds', 'Genes', 'Proteins', 'Release Date', 'Modify Date',
'Status', 'Center', 'BioSample Accession', 'Assembly Accession',
'Reference', 'FTP Path', 'Pubmed ID', 'Strain'],
dtype='object')
I ask the user to enter the name of the species with this script :
print("bacterie species?")
species=input()
I want to look for the rows with "Organism/Name" equal to the species written by the user (input) then to calculate with "values.count" of the status column and finally to retrieve 'FTP Path'.
Here is the code that I could do but that does not work:
if (data.loc[(data["Organism/Name"]==species)
print(Data['Status'].value_counts())
else:
print("This species not found")
if (data.loc[(data["Organism/Name"]==species)
print(Data['Status'].value_counts())
else:
print(Data.get["FTP Path"]
If I understand your question correctly, this is what you're trying to achieve:
import wget
import numpy as np
import pandas as pd

URL = 'https://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/prokaryotes.txt'
data = pd.read_csv(wget.download(URL), sep='\t', header=0)

species = input("Enter the bacteria species: ")

# Build the row mask once and reuse it.
#   regex=False - the user's text is matched literally, so characters such
#                 as "." or "(" in a species name are not read as a pattern
#   na=False    - rows without an organism name count as "no match" instead
#                 of putting NaN into the boolean mask
matches = data["#Organism/Name"].str.contains(species, case=False, regex=False, na=False)

# Defined up front so that code using FTP_list afterwards also works when
# nothing was found.
FTP_list = []
if matches.any():
    selected = data.loc[matches]
    print(selected['Status'].value_counts())
    # drop missing paths so the list holds strings only
    FTP_list = selected["FTP Path"].dropna().tolist()
else:
    print("This species not found")
To write all the FTP Path URLs into a text file, you can do this:
# One URL per line; the explicit encoding keeps the output independent of
# the platform default.
with open('/path/urls.txt', mode='wt', encoding='utf-8') as file:
    file.write('\n'.join(FTP_list))

Categories