Find which city has the highest number of trips - python

In a csv file, bikeshare data is available for three different cities: NYC, Chicago, Washington. I Need to find which city has the highest number of trips, and also which city has the highest proportion of trips made by subscribers (User_type).
Below is my code:
def number_of_trips(filename):
    """Return the name of the city with the highest number of trips.

    Parameters
    ----------
    filename : str
        Path to a CSV file whose header contains a 'city' column with
        values 'NYC', 'Chicago', or anything else (counted as Washington).

    Returns
    -------
    str
        'NYC', 'Chicago', or 'Washington'.
    """
    import csv  # local import so this snippet runs on its own

    with open(filename, 'r') as f_in:
        # set up csv reader object
        reader = csv.DictReader(f_in)
        # NOTE: the reported KeyError: 'city' means the CSV header does not
        # literally contain a 'city' column — check its spelling/case first.
        # initialize count variables
        ny_trips = 0
        wh_trips = 0
        ch_trips = 0
        # tally up ride types
        for row in reader:
            if row['city'] == 'NYC':
                ny_trips += 1
            elif row['city'] == 'Chicago':
                ch_trips += 1
            else:
                wh_trips += 1
    # Decide AFTER the loop finishes (the original returned inside the loop
    # body, so it answered after reading only the first row).  The original
    # chained comparison also labelled a Washington maximum as 'Chicago';
    # picking the max of a counts dict avoids both mistakes.  The unreachable
    # second return referencing undefined n_customers/n_total was removed.
    counts = {'NYC': ny_trips, 'Chicago': ch_trips, 'Washington': wh_trips}
    return max(counts, key=counts.get)
This is throwing an error: KeyError: 'city'.
I am very new to Python — please guide me on how to achieve the above requirements.

You should consider using the pandas library.
import pandas as pd
## Reading the CSV
# FIX: the method is read_csv — read_cvs (a typo) raises AttributeError.
df = pd.read_csv('file')
## Counting the values of each city entry (assuming 'city' is a column header)
df['city'].value_counts()
For the second piece, you can use a pivot table with the len as the aggfunc value. The documentation for pd.pivot_table is shown here.

Related

Getting past ZeroDivisionError in Python

So I'm trying to calculate some salary averages based on job types in Python.
I've managed to extract all necessary information from a csv and store them in variables, but when I come to calculating averages, there's a chance that some of the roles don't have any salaries (for example, the csv includes full time ai scientists but not contracted ai scientists). So when I try to calculate the average for a contracted ai scientist - I get a ZeroDivisionError.
I want Python to catch the ZeroDivisionError, ignore it, and continue computing the remaining averages — I couldn't get this to work with a try/except block and I'm not sure how to proceed from here.
This is my code so far:
# Accumulate total salary and count per (employment type, job title) so we can
# average at the end.  FT = full time, CT = contract.
ft_ai_scientist = 0
count_ft_ai_scientist = 0
ct_ai_scientist = 0
count_ct_ai_scientist = 0
with open('/Users/xxx/Desktop/ds_salaries.csv', 'r') as f:
    csv_reader = f.readlines()
    for row in csv_reader:
        new_row = row.split(',')
        # NOTE(review): positions 3/4/7 assume a fixed column layout with no
        # quoted commas — the csv module would be safer; verify the file.
        employment_type = new_row[3]
        job_title = new_row[4]
        salary_in_usd = new_row[7]
        if employment_type == 'FT' and job_title == 'AI Scientist':
            ft_ai_scientist += int(salary_in_usd)
            count_ft_ai_scientist += 1
        if employment_type == 'CT' and job_title == 'AI Scientist':
            ct_ai_scientist += int(salary_in_usd)
            count_ct_ai_scientist += 1
# FIX for the ZeroDivisionError: only divide when at least one matching row was
# seen; otherwise report 0.0 (no data for that role) instead of crashing.
avg_ft_ai_scientist = ft_ai_scientist / count_ft_ai_scientist if count_ft_ai_scientist else 0.0
avg_ct_ai_scientist = ct_ai_scientist / count_ct_ai_scientist if count_ct_ai_scientist else 0.0
print(avg_ft_ai_scientist)
print(avg_ct_ai_scientist)

Pandas finding a text in row and assign a dummy variable value based on this

I have a data frame which contains a text column i.e. df["input"],
I would like to create new dummy variables that check whether the df["input"] column contains any of the words in a given list. The logic is: 1) create a dummy variable that equals zero; 2) set it to one if the text contains any word in the corresponding list and was not already matched by one of the previous lists.
# Example lists
# Each list holds the keywords that should flag the matching dummy column.
listings = ["amazon listing", "ecommerce", "products"]
scripting = ["subtitle", "film", "dubbing"]
medical = ["medical", "biotechnology", "dentist"]
# Toy frame with one text column; each row is meant to match one list above.
df = pd.DataFrame({'input': ['amazon listing subtitle',
'medical',
'film biotechnology dentist']})
which looks like:
input
amazon listing subtitle
medical
film biotechnology dentist
final dataset should look like:
input listings scripting medical
amazon listing subtitle 1 0 0
medical 0 0 1
film biotechnology dentist 0 1 0
One possible implementation is to use str.contains in a loop to create the 3 columns, then use idxmax to get the column name (or the list name) of the first match, then create a dummy variable from these matches:
import numpy as np

# One boolean column per keyword list: True if the row text matches any word.
d = {'listings':listings, 'scripting':scripting, 'medical':medical}
for k,v in d.items():
    df[k] = df['input'].str.contains('|'.join(v))
# Keep only the FIRST matching column per row: argmax(axis=1) finds the first
# True in each row, and max(axis=1) zeroes rows with no match at all.
arr = df[list(d)].to_numpy()
tmp = np.zeros(arr.shape, dtype='int8')
tmp[np.arange(len(arr)), arr.argmax(axis=1)] = arr.max(axis=1)
out = pd.DataFrame(tmp, columns=list(d)).combine_first(df)
But in this case, it might be more efficient to use a nested for-loop:
import re

def get_dummy_vars(col, lsts):
    """Build one-hot rows: for each text in `col`, a list with a 1 in the
    position of the FIRST keyword list in `lsts` that matches (whole-word
    regex match), and 0s elsewhere.

    Parameters
    ----------
    col : iterable of str
        The input texts.
    lsts : sequence of sequences of str
        Keyword lists, in output-column order.

    Returns
    -------
    list[list[int]]
        One row per input text, each of length len(lsts).
    """
    out = []
    len_lsts = len(lsts)
    for row in col:
        tmp = []
        # in the nested loop, we use the any function to check for the first match
        # if there's a match, break the loop and pad 0s since we don't care if there's another match
        for lst in lsts:
            tmp.append(int(any(True for x in lst if re.search(fr"\b{x}\b", row))))
            if tmp[-1]:
                break
        tmp += [0] * (len_lsts - len(tmp))
        out.append(tmp)
    return out
# Run the helper over the input column and attach the resulting dummy rows to
# df; the column names are given in the same order as the lists in `lsts`.
lsts = [listings, scripting, medical]
out = df.join(pd.DataFrame(get_dummy_vars(df['input'], lsts), columns=['listings', 'scripting', 'medical']))
Output:
input listings medical scripting
0 amazon listing subtitle 1 0 0
1 medical 0 1 0
2 film biotechnology dentist 0 0 1
Here is a simpler - more pandas vector style solution:
import pandas as pd
import numpy as np

patterns = {} #<-- dictionary
patterns["listings"] = ["amazon listing", "ecommerce", "products"]
patterns["scripting"] = ["subtitle", "film", "dubbing"]
patterns["medical"] = ["medical", "biotechnology", "dentist"]
df = pd.DataFrame({'input': ['amazon listing subtitle',
                             'medical',
                             'film biotechnology dentist']})
#---------------------------------------------------------------#
# step 1, for each column create a reg-expression
for col, items in patterns.items():
    # create a regex pattern (word1|word2|word3)
    pattern = f"({'|'.join(items)})"
    # find the pattern in the input column
    df[col] = df['input'].str.contains(pattern, regex=True).astype(int)
# step 2, if the value to the left is 1, change its value to 0
## 2.1 create a mask
## shift the rows to the right,
## --> if the left column contains the same value as the current column: True, otherwise False
mask = (df == df.shift(axis=1)).values
# subtract the mask from the df
## and clip the result --> negative values will become 0
# FIX: the pasted version read `df[mask].iloc[:,1:]`, but boolean-masking the
# frame first turns non-matching cells into NaN; plain df.iloc[:,1:] is what
# reproduces the result table shown below.
df.iloc[:,1:] = np.clip( df.iloc[:,1:] - mask[:,1:], 0, 1 )
print(df)
Result
input listings scripting medical
0 amazon listing subtitle 1 0 0
1 medical 0 0 1
2 film biotechnology dentist 0 1 0
Great question and good answers (I somehow missed it yesterday)! Here's another variation with .str.extractall():
# Map each output column to its keyword list; the extra "dummy" key guarantees
# the alternation always has a trailing named group (dropped at the end).
search = {"listings": listings, "scripting": scripting, "medical": medical, "dummy": []}
# One big alternation of named groups: (?P<listings>\bw1\b|...)|(?P<scripting>...)|...
pattern = "|".join(
f"(?P<{column}>" + "|".join(r"\b" + s + r"\b" for s in strings) + ")"
for column, strings in search.items()
)
# extractall -> one row per match; groupby(level=0).any() collapses back to one
# row per input; idxmax(axis=1) picks the FIRST matching column name; then
# str.get_dummies one-hot encodes it and the helper "dummy" column is dropped.
result = (
df["input"].str.extractall(pattern).assign(dummy=True).groupby(level=0).any()
.idxmax(axis=1).str.get_dummies().drop(columns="dummy")
)

For loop: writing dictionary iteration output to excel using pandas?

I am trying to iterate over a dictionary that contains multiple row indexes in its values and then apply pd.nsmallest function to generate the top 3 smallest values for the multiple sets of row indexes that are in the dictionary. However, there seems to be something wrong with my for loop statement as I am overwriting the top 3 values over and over till the last set of values in the dictionary and so my final excel file output shows only 3 rows for the last run of the for loop.
When I use print statements this works as expected and I get an output for all 16 values in the dictionary but when writing to excel file it only gives me the output of the last run on the loop
import pandas as pd
from tabulate import tabulate
# Load the 'Variance by Co' sheet and keep only the three columns of interest.
VA = pd.read_excel('Columnar BU P&L.xlsx', sheet_name = 'Variance by Co')
legcon = VA[['Expense', 'Consolidation', 'Exp Category']]
# Label each row by the sign of its consolidation value.
# NOTE(review): legcon is a slice of VA, so this assignment may trigger a
# SettingWithCopyWarning — consider legcon = VA[...].copy(); verify locally.
legcon['Variance Type'] = ['Unfavorable' if x < 0 else 'favorable' for x in legcon['Consolidation']]
# Maps each expense category to the sheet row positions that belong to it.
d = {'Travel & Entertainment': [1,2,3,4,5,6,7,8,9,10,11], 'Office supplies & Expenses': [13,14,15,16,17],
'Professional Fees':[19,20,21,22,23], 'Fees & Assessments':[25,26,27], 'IT Expenses':[29],
'Bad Debt Expense':[31],'Miscellaneous expenses': [33,34,35,36,37],'Marketing Expenses':[40,41,42],
'Payroll & Related Expenses': [45,46,47,48,49,50,51,52,53,54,55,56], 'Total Utilities':[59,60],
'Total Equipment Maint, & Rental Expense': [63,64,65,66,67,68],'Total Mill Expense':[70,71,72,73,74,75,76,77],
'Total Taxes':[80,81],'Total Insurance Expense':[83,84,85],'Incentive Compensation':[88],
'Strategic Initiative':[89]}
Printing output directly works fine when I do this:
for key,value in d.items():
    # Keep only the unfavorable rows (negative consolidation) for this
    # category, then take the 3 most negative values.
    a = legcon.iloc[value][legcon.iloc[:,1]<0].nsmallest(3,'Consolidation')
    print(a)
Expense Consolidation Exp Category Variance Type
5 Transportation - AIR -19054 Travel & Entertainment Unfavorable
9 Meals -9617 Travel & Entertainment Unfavorable
7 Lodging -9439 Travel & Entertainment Unfavorable
Expense Consolidation Exp Category Variance Type
26 Bank Charges -4320 Fees & Assessments Unfavorable
27 Finance Charges -1389 Fees & Assessments Unfavorable
25 Payroll Fees -1145 Fees & Assessments Unfavorable
However when I use the below code to write to excel:
writer = pd.ExcelWriter('testt.xlsx', engine = 'xlsxwriter')
row = 0
# NOTE(review): this is the buggy version quoted in the question — `row` is
# never incremented, so startrow is always row+4 == 4 and every category
# overwrites the previous one at the same position.
for key,value in d.items():
a = legcon.iloc[value][legcon.iloc[:,1]<0].nsmallest(3,'Consolidation')
# NOTE(review): the inner range(0,16) loop just rewrites the same frame 16
# times, and calling writer.save() inside the loop closes the file early.
for i in range(0,16):
a.to_excel(writer, sheet_name = 'test', startrow = row+4, index = False)
writer.save()
my output looks like this and does not show all the exp categories:
I would really appreciate any feedback on how to correct this. Thanks in advance!
With some help from a friend I just realized my silly mistake: there was no row iterator in my for loop, so the output never advanced to the next lines. The code below fixed the issue (initially I had placed the row increment inside my df.to_excel statement):
writer = pd.ExcelWriter('testt.xlsx', engine = 'xlsxwriter')
row = 0
for key,value in d.items():
    # 3 most negative (most unfavorable) consolidation values per category.
    a = legcon.iloc[value][legcon.iloc[:,1]<0].nsmallest(3,'Consolidation')
    a.to_excel(writer, sheet_name = 'Testt', startrow = row, index = False)
    # Advance the anchor so the next category is written below this one.
    row = row+4
# NOTE(review): ExcelWriter.save() was removed in pandas 2.x — use
# writer.close() or `with pd.ExcelWriter(...) as writer:` on newer versions.
writer.save()

How to compare these data sets from a csv? Python 2.7

I have a project where I'm trying to create a program that will take a csv data set from www.transtats.gov which is a data set for airline flights in the US. My goal is to find the flight from one airport to another that had the worst delays overall, meaning it is the "worst flight". So far I have this:
`import csv
with open('826766072_T_ONTIME.csv') as csv_infile: #import and open CSV
reader = csv.DictReader(csv_infile)
total_delay = 0
flight_count = 0
flight_numbers = []
delay_totals = []
dest_list = [] #create empty list of destinations
for row in reader:
if row['ORIGIN'] == 'BOS': #only take flights leaving BOS
if row['FL_NUM'] not in flight_numbers:
flight_numbers.append(row['FL_NUM'])
if row['DEST'] not in dest_list: #if the dest is not already in the list
dest_list.append(row['DEST']) #append the dest to dest_list
# NOTE(review): csv.DictReader is a one-shot iterator — the first loop above
# exhausts it, so the nested loops below never see any rows unless the file is
# re-opened or the rows are cached in a list first.
for number in flight_numbers:
for row in reader:
if row['ORIGIN'] == 'BOS': #for flights leaving BOS
if row['FL_NUM'] == number:
if float(row['CANCELLED']) < 1: #if the flight is not cancelled
if float(row['DEP_DELAY']) >= 0: #and the delay is greater or equal to 0 (some flights had negative delay?)
total_delay += float(row['DEP_DELAY']) #add time of delay to total delay
flight_count += 1 #add the flight to total flight count
# NOTE(review): row['DEP_DELAY'] is a string — sum() over it raises TypeError;
# accumulate float(row['DEP_DELAY']) per flight number instead.
for row in reader:
for number in flight_numbers:
delay_totals.append(sum(row['DEP_DELAY']))`
I was thinking that I could create a list of flight numbers and a list of the total delays from those flight numbers and compare the two and see which flight had the highest delay total. What is the best way to go about comparing the two lists?
I'm not sure if I understand you correctly, but I think you should use dict for this purpose, where key is a 'FL_NUM' and value is total delay.
In general I want to eliminate loops in Python code. For files that aren't massive I'll typically read through a data file once and build up some dicts that I can analyze at the end. The below code isn't tested because I don't have the original data but follows the general pattern I would use.
Since a flight is identified by the origin, destination, and flight number I would capture them as a tuple and use that as the key in my dict.
from collections import defaultdict

# Collect every departure delay per (origin, dest, flight number) tuple, then
# pick the flight with the worst average delay.
flight_delays = defaultdict(list) # look this up if you aren't familiar
for row in reader:
    if row['ORIGIN'] == 'BOS': #only take flights leaving BOS
        # FIX: CSV fields are strings, and we want the NON-cancelled flights —
        # the pasted `if row['CANCELLED'] > 0:` kept only cancelled ones (and
        # compared str to int); this mirrors the asker's own test.
        if float(row['CANCELLED']) < 1:
            flight = (row['ORIGIN'], row['DEST'], row['FL_NUM'])
            flight_delays[flight].append(float(row['DEP_DELAY']))
# Finished reading through data, now I want to calculate average delays
worst_flight = ""
worst_delay = 0
for flight, delays in flight_delays.items():
    average_delay = sum(delays) / len(delays)
    if average_delay > worst_delay:
        worst_flight = flight[0] + " to " + flight[1] + " on FL#" + flight[2]
        worst_delay = average_delay
A very simple solution would be. Adding two new variables:
# NOTE(review): illustrative sketch, not runnable — it tracks the single worst
# delay (a maximum), not an average, and the last line is a placeholder.
max_delay = 0
delay_flight = 0
# Change: if float(row['DEP_DELAY']) >= 0: FOR:
if float(row['DEP_DELAY']) > max_delay:
max_delay = float(row['DEP_DELAY'])
delay_flight = #save the row number or flight number for reference.

Odd Column returns when scraping with lxml

I am learning python and trying to build a scraper to glean parts data from a suppliers site. My issue now is that I am getting different column counts from my parsed table rows where I know that each row has the same column count. The issue has to be something I am overlooking and after two days of trying different things I am asking for a few more sets of eyes on my code to locate my error. Not having much python coding experience is no doubt my biggest hurdle.
First, the data. Rather than paste the html I have stored in my database, I'll give you a link to the live site I have crawled and stored in my db. The first link is this one.
The issue is that I get mostly correct results. However, every so often I get the values skewed in the column count. I can't seem to locate the cause.
Here is an example of the flawed result:
----------------------------------------------------------------------------------
Record: 1 Section:Passenger / Light Truck Make: ACURA SubMake:
Model: CL SubModel: Year: 1997 Engine: L4 1.6L 1590cc
----------------------------------------------------------------------------------
Rec:1 Row 6 Col 1 part Air Filter
Rec:1 Row 6 Col 2 2
Rec:1 Row 6 Col 3 part_no 46395
Rec:1 Row 6 Col 4 filter_loc
Rec:1 Row 6 Col 5 engine
Rec:1 Row 6 Col 6 vin_code V6 3.0L 2997cc
Rec:1 Row 6 Col 7 comment Engine Code J30A1
** Note that the engine value has been shifted to the vin_code field.
And proof it works some of the time:
Record: 2 Section:Passenger / Light Truck Make: ACURA SubMake:
Model: CL SubModel: Year: 1998 Engine: L4 1.6L 1590cc
----------------------------------------------------------------------------------
Rec:3 Row 4 Col 1 part Oil Filter
Rec:3 Row 4 Col 2 2
Rec:3 Row 4 Col 3 part_no 51334
Rec:3 Row 4 Col 4 filter_loc
Rec:3 Row 4 Col 5 engine L4 2.3L 2254cc
Rec:3 Row 4 Col 6 vin_code
Rec:3 Row 4 Col 7 comment Engine Code F23A1
** Note the fields line up in this record...
I suspect either there is something in the table cells my parser is not looking for or I have missed something trivial.
Here is the important portion of my code:
# Per Query
while records:
# Per Query Loop
#print str(records)
for record in records:
print 'Record Count:'+str(rec_cnt)
items = ()
item = {}
source = record['doc']
page = html.fromstring(source)
for rows in page.xpath('//div/table'):
#records = []
item = {}
cntx = 0
for row in list(rows):
cnty = 0 # Column Counter
found_oil = 0 # Found oil filter record flag
data = {} # Data
# Data fields
field_data = {'part':'', 'part_no':'', 'filter_loc':'', 'engine':'', 'vin_code':'', 'comment':'', 'year':''}
print
print '----------------------------------------------------------------------------------'
print 'Record: '+str(record['id']), 'Section:'+str(record['section']), 'Make: '+str(record['make']), 'SubMake: '+str(record['submake'])
print 'Model: '+str(record['model']), 'SubModel: '+str(record['submodel']), 'Year: '+str(record['year']), 'Engine: '+str(record['engine'])
print '----------------------------------------------------------------------------------'
#
# Rules for extracting data columns
# 1. First column always has a link to the bullet image
# 2. Second column is part name
# 3. Third column always empty
# 4. Fourth column is part number
# 5. Fith column is empty
# 6. Sixth column is part location
# 7. Seventh column is always empty
# 8. Eigth column is engine size
# 9. Ninth column is vin code
# 10. Tenth column is COmment
# 11. Eleventh column does not exist.
#
for column in row.xpath('./td[#class="blackmedium"][text()="0xa0"] | ./td[#class="blackmedium"][text()="\n"]/text() | ./td[#class="blackmeduim"]/img[#src]/text() | ./td[#class="blackmedium"][text()=""]/text() | ./td[#class="blackmedium"]/b/text() | ./td[#class="blackmedium"]/a/text() |./td[#class="blackmedium"]/text() | ./td[#class="blackmedium"][text()=" "]/text() | ./td[#class="blackmedium"][text()="&#160"]/text() | ./td[#class="blackmedium"][text()=None]/text()'):
#' | ./td[position()>1]/a/text() | ./td[position()>1]/text() | self::node()[position()=1]/td/text()'):
cnty+=1
if ('Oil Filter' == column.strip() or 'Air Filter' == column.strip()) and found_oil == 0:
found_oil = 1
if found_oil == 1:
print 'Rec:'+str(rec_cnt), 'Row '+str(cntx), 'Col '+str(cnty), _fields[cnty], column.strip()
#cnty+= 1
#print
else:
print 'Rec: '+str(rec_cnt), 'Col: '+str(cnty)
field_data[ str(_fields[cnty]) ] = str(column.strip())
#cnty = cnty+1
# Save data to db dest table
if found_oil == 1:
data['source_id'] = record['id']
data['section_id'] = record['section_id']
data['section'] = record['section']
data['make_id'] = record['make_id']
data['make'] = record['make']
data['submake_id'] = record['submake_id']
data['submake'] = record['submake']
data['model_id'] = record['model_id']
data['model'] = record['model']
data['submodel_id'] = record['submodel_id']
data['submodel'] = record['submodel']
data['year_id'] = record['year_id']
data['year'] = record['year']
data['engine_id'] = record['engine_id']
data['engine'] = record['engine']
data['part'] = field_data['part']
data['part_no'] = field_data['part_no']
data['filter_loc'] = field_data['filter_loc']
data['vin_code'] = field_data['vin_code']
data['comment'] = conn.escape_string(field_data['comment'])
data['url'] = record['url']
save_data(data)
print 'Filed Data:'
print field_data
cntx+=1
rec_cnt+=1
#End main per query loop
delay() # delay if wait was passed on cmd line
records = get_data()
has_offset = 1
#End Queries
Thank you all for your help and your eyes...
Usually when I run into a problem like this, I do two things:
Break the problem down into smaller chunks. Use python functions or classes to perform subsets of functionality so that you can test the functions individually for correctness.
Use the Python Debugger to inspect the code as it runs to understand where it's failing. For example, in this case, I would add import pdb; pdb.set_trace() before the line that says cnty+=1.
Then, when the code runs, you'll get an interactive interpreter at that point and you can inspect the various variables and discover why you're not getting what you expect.
A couple of tips for using pdb:
Use c to allow the program to continue (until the next breakpoint or set_trace); Use n to step to the next line in the program. Use q to raise an Exception (and usually abort).
Can you share the details of your scraping process? The intermittent failures could be caused by the parsing of the HTML data.
The problem seems to be that your xpath expression searches for text nodes. No matches are found for empty cells, causing your code to "skip" columns. Try iterating over the td elements themselves, and then "look down" from the element to its contents. To get you started:
# just iterate over child elements of the row, which are always td
# use enumerate to easily get a counter for the columns
for col_no, td in enumerate(row, start=1):
# use the xpath function string() to get the string value for the element
# this will yield an empty string for empty elements
print col_no, td.xpath('string()')
Note that the use of the string() xpath function may in some cases not be enough/too simple for what you want. In your example, you may find something like <td><a>51334</a><sup>53</sup></td> (see oil filter). My example would give you "5133453", where you would seem to need "51334" (not sure if that was intentional or if you hadn't noticed the "missing" part; if you do want only the text in the hyperlink, use td.findtext('a')).
I want to thank everyone who has given aid to me these past few days. All your input has resulted in a working application that I am now using. I wanted to post the resulting changes to my code so those who look here may find an answer or at least information on how they may also tackle their issue. Below is the rewritten portion of my code that solved the issues I was having:
#
# get_column_index()
# returns a dict of column names/column number pairs
#
def get_column_index(row):
index = {}
col_no = 0
td = None
name = ''
for col_no, td in enumerate(row, start=0):
mystr = str(td.xpath('string()').encode('ascii', 'replace'))
name = str.lower(mystr).replace(' ', '_')
idx = name.replace('.', '')
index[idx] = col_no
if int(options.verbose) > 2:
print 'Field Index:', str(index)
return index
def run():
global has_offset
records = get_data()
#print 'Records', records
rec_cnt = 0
# Per Query
while records:
# Per Query Loop
#print str(records)
for record in records:
if int(options.verbose) > 0:
print 'Record Count:'+str(rec_cnt)
items = ()
item = {}
source = record['doc']
page = html.fromstring(source)
col_index = {}
for rows in page.xpath('//div/table'):
#records = []
item = {}
cntx = 0
for row in list(rows):
data = {} # Data
found_oil = 0 #found proper part flag
# Data fields
field_data = {'part':'', 'part_no':'', 'part_note':'', 'filter_loc':'', 'engine':'', 'vin_code':'', 'comment':'', 'year':''}
if int(options.verbose) > 0:
print
print '----------------------------------------------------------------------------------'
print 'Row'+str(cntx), 'Record: '+str(record['id']), 'Section:'+str(record['section']), 'Make: '+str(record['make']), 'SubMake: '+str(record['submake'])
print 'Model: '+str(record['model']), 'SubModel: '+str(record['submodel']), 'Year: '+str(record['year']), 'Engine: '+str(record['engine'])
print '----------------------------------------------------------------------------------'
# get column indexes
if cntx == 1:
col_index = get_column_index(row)
if col_index != None and cntx > 1:
found_oil = 0
for col_no, td in enumerate(row):
if ('part' in col_index) and (col_no == col_index['part']):
part = td.xpath('string()').strip()
if 'Oil Filter' == part or 'Air Filter' == part or 'Fuel Filter' == part or 'Transmission Filter' == part:
found_oil = 1
field_data['part'] = td.xpath('string()').strip()
# Part Number
if ('part_no' in col_index) and (col_no == col_index['part_no']):
field_data['part_no'] = str(td.xpath('./a/text()')).strip().replace('[', '').replace(']', '').replace("'", '')
field_data['part_note'] = str(td.xpath('./sup/text()')).strip().replace('[', '').replace(']', '').replace("'", '')
# Filter Location
if ('filterloc' in col_index) and (col_no == col_index['filterloc']):
field_data['filter_loc'] = td.xpath('string()').strip()
# Engine
if ('engine' in col_index) and (col_no == col_index['engine']):
field_data['engine'] = td.xpath('string()').strip()
if ('vin_code' in col_index) and (col_no == col_index['vin_code']):
field_data['vin_code'] = td.xpath('string()').strip()
if ('comment' in col_index) and (col_no == col_index['comment']):
field_data['comment'] = td.xpath('string()').strip()
if int(options.verbose) == 0:
print ','
if int(options.verbose) > 0:
print 'Field Data: ', str(field_data)
elif int(options.verbose) == 0:
print '.'
# Save data to db dest table
if found_oil == 1:
data['source_id'] = record['id']
data['section_id'] = record['section_id']
data['section'] = record['section']
data['make_id'] = record['make_id']
data['make'] = record['make']
data['submake_id'] = record['submake_id']
data['submake'] = record['submake']
data['model_id'] = record['model_id']
data['model'] = record['model']
data['submodel_id'] = record['submodel_id']
data['submodel'] = record['submodel']
data['year_id'] = record['year_id']
data['year'] = record['year']
data['engine_id'] = record['engine_id']
data['engine'] = field_data['engine'] #record['engine']
data['part'] = field_data['part']
data['part_no'] = field_data['part_no']
data['part_note'] = field_data['part_note']
data['filter_loc'] = field_data['filter_loc']
data['vin_code'] = field_data['vin_code']
data['comment'] = conn.escape_string(field_data['comment'])
data['url'] = record['url']
save_data(data)
found_oil = 0
if int(options.verbose) > 2:
print 'Data:', str(data)
cntx+=1
rec_cnt+=1
#End main per query loop
delay() # delay if wait was passed on cmd line
records = get_data()
has_offset = 1
#End Queries

Categories