I am taking a Udacity programming course and have been stuck on the same problem for a week. I finally think I am close to getting it right, but I don't understand the last error. Here is my code:
def process_file(f):
# This is example of the datastructure you should return
# Each item in the list should be a dictionary containing all the relevant data
# Note - year, month, and the flight data should be integers
# You should skip the rows that contain the TOTAL data for a year
# data = [{"courier": "FL",
# "airport": "ATL",
# "year": 2012,
# "month": 12,
# "flights": {"domestic": 100,
# "international": 100}
# },
# {"courier": "..."}
# ]
data = []
info = {}
info["courier"], info["airport"] = f[:6].split("-")
with open("{}/{}".format(datadir, f), "r") as html:
soup = BeautifulSoup(html)
car = str(html)[17:19]
airp = str(html)[20:23]
mydict = {}
x = 0
table = soup.find("table", {"class": "dataTDRight"})
rows = table.find_all('tr')
for row in rows:
cells = row.find_all('td')
year = cells[0].get_text()
year = (year.encode('ascii'))
Month = cells[1].get_text()
Month = (Month.encode('ascii'))
domestic = cells[2].get_text()
domestic = (domestic.encode('ascii'))
international = cells[3].get_text()
international = (international.encode('ascii'))
if Month != "Month" and Month != "TOTAL":
Month = int(Month)
year = int(year)
domestic = int(domestic.replace(',', ''))
international = int(international.replace(',', ''))
mydict['courier'] = car
mydict['airport'] = airp
mydict['year'] = year
mydict['month'] = Month
mydict['flights'] = (domestic, international)
data.append(mydict.copy())
#print type(domestic)
#print mydict
print data
return data
def test():
print "Running a simple test..."
open_zip(datadir)
files = process_all(datadir)
data = []
for f in files:
data += process_file(f)
assert len(data) == 399
for entry in data[:3]:
assert type(entry["year"]) == int
assert type(entry["month"]) == int
assert type(entry["flights"]["domestic"]) == int
assert len(entry["airport"]) == 3
assert len(entry["courier"]) == 2
assert data[-1]["airport"] == "ATL"
assert data[-1]["flights"] == {'international': 108289, 'domestic': 701425}
print "... success!"
The error message I get is:
Traceback (most recent call last):
File "vm_main.py", line 33, in <module>
import main
File "/tmp/vmuser_elbzlfkcpw/main.py", line 2, in <module>
import studentMain
File "/tmp/vmuser_elbzlfkcpw/studentMain.py", line 2, in <module>
process.test()
File "/tmp/vmuser_elbzlfkcpw/process.py", line 114, in test
assert type(entry["flights"]["domestic"]) == int
TypeError: tuple indices must be integers, not str
I am a total beginner. I checked the types of both domestic and international, and they are both int.
Can anybody tell me where I can look up or what I did wrong?
You created a tuple here:
mydict['flights'] = (domestic, international)
so mydict['flights'] is a tuple. But you try to treat it as a dictionary here:
assert type(entry["flights"]["domestic"]) == int
That won't work; you'll need to use integer indices here:
assert type(entry["flights"][0]) == int
or better still, use isinstance() to test for types:
assert isinstance(entry["flights"][0], int)
Here you assign your data mydict['flights'] as a tuple.
def process_file(f):
# Omitted code...
mydict['flights'] = (domestic, international)
Your error then comes from an illegal access on that data type. You are attempting to access the first item of that tuple by the name of the variable you used in the assignment:
assert type(entry["flights"]["domestic"]) == int
You either need to access your data via an integer index:
assert type(entry["flights"][0]) == int
Or you need to change your assignment to:
mydict['flights'] = {"domestic":domestic, "international":international}
Tuples are immutable data types which are indexed by integers. The type of access you are attempting is typical of a dictionary, where keys can be of any hashable type (such as strings).
Related
I'm writing a program that tells me how much I spent in a month. I need to sum all the expenses that I had. I'm trying to use the sum() function in Python, but it's giving me this error:
Traceback (most recent call last):
File "C:/Users/lucas/Desktop/Projeto automacao financeira/pdf.py", line 82, in <module>
a.analyzePdf("Document.pdf")
File "C:/Users/lucas/Desktop/Projeto automacao financeira/pdf.py", line 61, in analyzePdf
print("Sum of elements in given list is :", sum(object))
TypeError: 'float' object is not iterable
What should I do? I already converted the values from str to float, but I still don't get the expected result. Here's the code:
class BankAccountReport():
def __init__(self):
self.IV_total_spent = 0.0
self.IV_current_funds = 0.0
self.IV_expenses = {"Day of the month": {}, "Business Expenses": {}, "Transaction ID": {}, "location": "", "cost": 0.0,"personal": ""}
self.IV_deposits = {"Day of the month": {}, "Transaction ID": {}, "location": "", "cost": 0.0}
self.IV_expense_filters = {"Business": {}, "Random": {}, "Search_string": "","Search_type": 0, "Personal": {}}
def analyzePdf(self, pdf_file=None):
call_result = {}
debug_data = []
return_msg = "BankAccountReport:analyzePdf"
date = ""
if type(pdf_file) != str:
return_msg += "input validation failed: pdf_file must be an string that sets where the document is"
return {{'success': RC.input_validation_failed, 'return_msg': return_msg, 'debug_data': debug_data}}
if type(date) != str:
return_msg += "You shouldn`t put "" in the date you are setting to"
return {{'success': RC.input_validation_failed, 'return_msg': return_msg, 'debug_data': debug_data}}
df = wrapper.read_pdf(pdf_file, pages= "all", output_format="csv")
df = df.fillna("0")
df.to_csv("output.csv")
df = pd.read_csv("output.csv", usecols=['Data', "Docto.", "Débito (R$)", "Saldo (R$)"])
df.to_csv("output2.csv")
#print(df)
extract = pd.read_csv('output2.csv')
date = input("what date do you want?")
date_filter = (extract["Data"] == date)
filtered_dates = extract[date_filter]
#print(filtered_dates.head())
data = pd.read_csv("output2.csv", usecols= ["Saldo (R$)"] )
data1 = pd.read_csv("output2.csv", usecols=['Débito (R$)'])
print(data)
names = data.values.tolist()
names1= data1.values.tolist()
print(names1)
print(names)
numbers = df['Saldo (R$)'].str.split().str[1].str.replace(',', '.').apply(float)
numbers1 = df["Débito (R$)"].str.split().str[1].str.replace(",", ".").apply(float)
numbers = numbers.fillna("0")
numbers1 = numbers.fillna("0")
for object in numbers1:
print(object)
print("Sum of elements in given list is :", sum(object))
There is a lot of redundant information, and also a lot of missing information in your question. In particular, we do not know the data types of the columns in numbers1. I guess you want to add all rows, and the items are originally represented as strings. Convert them to floats and sum up:
numbers1.astype(float).sum()
You are probably getting this error because df['Saldo (R$)'].str will not return pandas series.
Probably you need to write your own function such as
def toNumber(str):
    """Convert one currency cell such as ``"R$ 10,50"`` to a float.

    The cell is split on whitespace, the second token (the amount) is taken,
    and its decimal comma is replaced by a dot before conversion.

    Raises IndexError if the cell has no second whitespace-separated token,
    and ValueError if that token is not a valid number.
    """
    # NOTE: the parameter name shadows the builtin ``str``; it is kept so the
    # signature stays unchanged. Inside this function ``str`` is the cell
    # value, a plain Python string, so ordinary string methods are used here
    # rather than the pandas ``.str`` accessor (which only exists on Series).
    return float(str.split()[1].replace(',', '.'))
and in the code you can write
numbers = df['Saldo (R$)'].apply(toNumber)
Two strings. My items name:
Parfume name EDT 50ml
And competitor's items name:
Parfume another name EDP 60ml
I have a long list of these names in one column and the competitors' names in another column. I want to keep only those rows in the dataframe that have the same amount of ml in both my name and the competitor's name, no matter what everything else in these strings looks like. So how do I find a substring ending with 'ml' in a bigger string? Then I could simply do
"**ml" in competitors_name
to see if they both contain the same amount of ml.
Thank you
UPDATE
'ml' is not always at the end of string. It might look like this
Parfume yet another great name 60ml EDP
Try this:
import re
def same_measurement(my_item, competitor_item, unit="ml"):
    """Return True when both item names state the same amount of *unit*.

    The amount is the run of digits immediately preceding *unit* (for
    example ``"50"`` in ``"Parfume name EDT 50ml"``); the first such
    occurrence in each name is used, wherever it appears in the string.

    Returns False when either name contains no ``<digits><unit>`` token.
    """
    # re.escape keeps units that contain regex metacharacters (e.g. "fl.oz")
    # literal instead of letting "." match any character.
    matcher = re.compile(r"(\d+){}".format(re.escape(unit)))
    my_match = matcher.search(my_item)
    competitor_match = matcher.search(competitor_item)
    # Return a real bool rather than None / a match object.
    if my_match is None or competitor_match is None:
        return False
    return my_match.group(1) == competitor_match.group(1)
my_item = "Parfume name EDT 50ml"
competitor_item = "Parfume another name EDP 50ml"
assert same_measurement(my_item, competitor_item)
my_item = "Parfume name EDT 50ml"
competitor_item = "Parfume another name EDP 60ml"
assert not same_measurement(my_item, competitor_item)
You could use the python Regex library to select the 'xxml' values for each of your data rows and then do some logic to check if they match.
import re
data_rows = [["Parfume name EDT", "Parfume another name EDP 50ml"]]

# Matches a volume token such as "50ml" or "1000ML". ``\d+`` (instead of a
# fixed 1-3 digits) keeps "1000ml" from being truncated to "000ml", and the
# pattern is compiled once because it is reused for every row.
ml_pattern = re.compile(r'\d+ml', re.IGNORECASE)

for data_pairs in data_rows:
    my_ml = None
    comp_ml = None

    # Check for my ml matches and set value
    my_ml_matches = ml_pattern.search(data_pairs[0])
    if my_ml_matches is not None:
        # Lower-case so that "50ML" and "50ml" compare equal below.
        my_ml = my_ml_matches.group(0).lower()
    else:
        print("my_ml has no ml")

    # Check for comp ml matches and set value
    comp_ml_matches = ml_pattern.search(data_pairs[1])
    if comp_ml_matches is not None:
        comp_ml = comp_ml_matches.group(0).lower()
    else:
        print("comp_ml has no ml")

    # Print outputs (only when both names actually contain a volume)
    if my_ml is not None and comp_ml is not None:
        if my_ml == comp_ml:
            print("my_ml: {0} == comp_ml: {1}".format(my_ml, comp_ml))
        else:
            print("my_ml: {0} != comp_ml: {1}".format(my_ml, comp_ml))
Where data_rows = each row in the data set
Where data_pairs = {your_item_name, competitor_item_name}
You could use a lambda function to do that.
import pandas as pd
import re
def ml_amount(name):
    """Return the digits directly preceding 'ml' in *name*, or None if absent."""
    match = re.search(r'(\d+)ml', name)
    return match.group(1) if match else None


d = {
    'Us': ['Parfume one 50ml', 'Parfume two 100ml'],
    'Competitor': ['Parfume uno 50ml', 'Parfume dos 200ml'],
}
df = pd.DataFrame(data=d)
# A name without any "<digits>ml" token yields None; such rows are marked
# "No" instead of raising AttributeError on a failed re.search().
df['Eq'] = df.apply(
    lambda x: 'Yes'
    if ml_amount(x['Us']) is not None
    and ml_amount(x['Us']) == ml_amount(x['Competitor'])
    else 'No',
    axis=1,
)
Result:
It doesn't matter whether 'ml' is at the end or in the middle of the string.
I created a python app to parse a json API.
There are 3 endpoints, and 1 of them is giving me trouble.
The endpoint is : http://coinmarketcap.northpole.ro/history.json?coin=PCN
My code :
def getHistory(self, coin):
endpoint = "history.json?year=2017&coin=PCN"
data = urllib2.urlopen(self.url + endpoint).read()
data = json.loads(data)['history']
return data
def getOrder(self):
for c in self.getCoinsList():
res = []
symbol = c['symbol']
price = self.getCoinPrice(symbol)
count = 0
count_days = len(self.getHistory(symbol))
for h in self.getHistory(symbol):
if h['price']['usd'] > price:
++count
percent_down = count_days / count * 100
line = {'symbol': symbol, 'price': price, 'percent_down': percent_down}
res.append(line)
return res
When I try to get the h['price']['usd'] I have this :
File "coinmarketcap.py", line 39, in getOrder
if h['price']['usd'] > price:
TypeError: string indices must be integers
When I do print type(h), it returns unicode.
getHistory returns a dict, and when you iterate over it like this:
for h in self.getHistory(symbol):
you're iterating over the dict keys, not values.
To iterate over the value instead, use
for h in self.getHistory(symbol).values(): # .itervalues() in python2
@Pixel, I think you are assuming that for h in self.getHistory(symbol): yields the value stored under each key, which is incorrect; it yields the key itself.
Try saving the dictionary and fetch by key mapping, like this,
# Fetch the history once and look each entry up by its key.
json_data = self.getHistory(symbol)
for h in json_data:
    if json_data[h]['price']['usd'] > price:
        # Python has no ++ operator: "++count" parses as +(+count) and
        # leaves count unchanged, so increment explicitly.
        count += 1
or retrieve the values from the dictionary values, using
# Iterate over the dict's values so that h is the history entry itself.
for h in self.getHistory(symbol).values():
    if h['price']['usd'] > price:
        # Python has no ++ operator: "++count" parses as +(+count) and
        # leaves count unchanged, so increment explicitly.
        count += 1
I have a JSON array shown below.
[
"3D3iAR9M4HDETajfD79gs9BM8qhMSq5izX",
"35xfg4UnpEJeHDo55HNwJbr1V3G1ddCuVA"
]
I would like to add a value in the form of the string (self.tx_amount_5) so I get a JSON OBJECT something like this:
{
"3D3iAR9M4HDETajfD79gs9BM8qhMSq5izX" : 100000
"35xfg4UnpEJeHDo55HNwJbr1V3G1ddCuVA" : 100000
}
The part of code that has generated the first JSON array is:
r = requests.get('http://api.blockcypher.com/v1/btc/main/addrs/A/balance')
balance = r.json()['balance']
with open("Entries#x1.csv") as f,open("winningnumbers.csv") as nums:
nums = set(imap(str.rstrip, nums))
r = csv.reader(f)
results = defaultdict(list)
for row in r:
results[sum(n in nums for n in islice(row, 1, None))].append(row[0])
self.number_matched_0 = results[0]
self.number_matched_1 = results[1]
self.number_matched_2 = results[2]
self.number_matched_3 = results[3]
self.number_matched_4 = results[4]
self.number_matched_5 = results[5]
self.number_matched_5_json = json.dumps(self.number_matched_5, sort_keys = True, indent = 4)
print(self.number_matched_5_json)
if len(self.number_matched_3) == 0:
print('Nobody matched 3 numbers')
else:
self.tx_amount_3 = int((balance*0.001)/ len(self.number_matched_3))
if len(self.number_matched_4) == 0:
print('Nobody matched 4 numbers')
else:
self.tx_amount_4 = int((balance*0.1)/ len(self.number_matched_4))
if len(self.number_matched_5) == 0:
print('Nobody matched 3 numbers')
else:
self.tx_amount_5 = int((balance*0.4)/ len(self.number_matched_5))
If I understand correctly, you can create the dictionary like this:
import json
s="""[
"3D3iAR9M4HDETajfD79gs9BM8qhMSq5izX",
"35xfg4UnpEJeHDo55HNwJbr1V3G1ddCuVA"
]"""
d = {el: self.tx_amount_5 for el in json.loads(s)}
print(d)
which produces
{'3D3iAR9M4HDETajfD79gs9BM8qhMSq5izX': 100000,
'35xfg4UnpEJeHDo55HNwJbr1V3G1ddCuVA': 100000}
I am trying to write a script to generate data. I am using the random package for this. I execute the script and everything appears to work fine. But when I checked the results, I found that the last 100+ rows are missing from the output file for some reason.
Can someone suggest why this could be happening?
from __future__ import print_function
from faker import Faker;
import random;
## Vaue declaration
population = 3;
product = 3;
years = 3;
months = 13;
days = 30;
tax= 3.5;
## Define Column Header
Column_Names = "Population_ID",";","Product_Name",";","Product_ID",";","Year",";",
"Month",";","Day","Quantity_sold",";","Sales_Price",";","Discount",
";","Actual_Sales_Price",tax;
## Function to generate sales related information
def sales_data():
for x in range(0,1):
quantity_sold = random.randint(5,20);
discount = random.choice(range(5,11));
sales_price = random.uniform(20,30);
return quantity_sold,round(sales_price,2),discount,round((sales_price)-(sales_price*discount)+(sales_price*tax));
## Format the month to quarter and return the value
def quarter(month):
if month >= 1 and month <= 3:
return "Q1";
elif month > 3 and month <= 6:
return "Q2";
elif month > 6 and month <= 9:
return "Q3";
else:
return "Q4";
## Generate product_id
def product_name():
str2 = "PROD";
sample2 = random.sample([1,2,3,4,5,6,7,8,9],5);
string_list = [];
for x in sample2:
string_list.append(str(x));
return (str2+''.join(string_list));
### Main starts here ###
result_log = open("C:/Users/Sangamesh.sangamad/Dropbox/Thesis/Data Preparation/GenData.csv",'w')
print (Column_Names, result_log);
### Loop and Generate Data ###
for pop in range(0,population):
pop = random.randint(55000,85000);
for prod_id in range(0,product):
product_name2 = product_name();
for year in range(1,years):
for month in range(1,months):
for day in range(1,31):
a = sales_data();
rows = str(pop)+";"+product_name2+";"+str(prod_id)+";"+str(year)+";"+str(month)+";"+quarter(month)+";"+str(day)+";"+str(a[0])+";"+str(a[1])+";"+str(a[2])+";"+str(tax)+";"+str(a[3]);
print(rows,file=result_log);
#print (rows);
tax = tax+1;
You need to close a file to have the buffers flushed:
result_log.close()
Better still, use the file object as a context manager and have the with statement close it for you when the block exits:
filename = "C:/Users/Sangamesh.sangamad/Dropbox/Thesis/Data Preparation/GenData.csv"
# The target name goes after "as"; "with result_log = open(...)" is a
# syntax error. The file is flushed and closed when the block exits.
with open(filename, 'w') as result_log:
    # code writing to result_log
    pass  # placeholder so the snippet is syntactically complete
Rather than manually writing strings with delimiters in between, you should really use the csv module:
import csv
# ..
column_names = (
"Population_ID", "Product_Name", "Product_ID", "Year",
"Month", "Day", "Quantity_sold", "Sales_Price", "Discount",
"Actual_Sales_Price", tax)
# ..
# The target name goes after "as"; "with result_log = open(...)" is a
# syntax error. 'wb' is what the csv module expects on Python 2 (which the
# question's code targets); on Python 3 use open(filename, 'w', newline='').
with open(filename, 'wb') as result_log:
    writer = csv.writer(result_log, delimiter=';')
    writer.writerow(column_names)
    # looping
    row = [pop, product_name2, prod_id, year, month, quarter(month), day,
           a[0], a[1], a[2], tax, a[3]]
    writer.writerow(row)