I am trying to update a rating row by row. I have one dataframe of players, who all start with the same rating; for each match, I want the ratings to change. Another dataframe contains the result of each match.
import pandas as pd
gamesdata = [['paul','tom'],['paul','lisa'],['tom','paul'],['lisa','tom'],['paul','lisa'],['lisa','tom'],['paul','tom']]
games = pd.DataFrame(gamesdata, columns = ['Winner', 'Looser'])
playersdata = ['lisa','paul','tom']
players = pd.DataFrame(playersdata, columns = ['Name'])
mean_elo = 1000
elo_width = 400
k_factor = 64
players['elo'] = mean_elo
def update_elo(winner_elo, loser_elo):
    expected_win = expected_result(winner_elo, loser_elo)
    change_in_elo = k_factor * (1 - expected_win)
    winner_elo += change_in_elo
    loser_elo -= change_in_elo
    return winner_elo, loser_elo

def expected_result(elo_a, elo_b):
    expect_a = 1.0 / (1 + 10**((elo_b - elo_a) / elo_width))
    return expect_a

for index, row in games.iterrows():
    winnername = row['Winner']
    losername = row['Looser']
    web = players['elo'].loc[players['Name'] == winnername].values[0]
    wIndex = players.loc[players['Name'] == winnername]
    # I want to return just the index, so I can update the value
    print(wIndex)
    leb = players['elo'].loc[players['Name'] == losername].values[0]
    print('Winner Elo before: ' + str(web))
    winner_elo, looser_elo = update_elo(web, leb)
    print('Winner Elo after: ' + str(winner_elo))
    # here I want to update the value
    # players.at[wIndex, 'elo'] = winner_elo
I am trying to update the value in the players table using
players.at[wIndex,'elo']=winner_elo
but I struggle to get the index with this code:
wIndex = players.loc[players['Name'] == winnername]
Found a solution:
wIndex = players.loc[players['Name'] == winnername].index.values
Can't believe I missed that.
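For completeness, a sketch of the full update loop built on that fix. Note that DataFrame.at expects a scalar label, so this takes the first element of the matching index rather than the whole array (an assumption worth checking against your pandas version):
for index, row in games.iterrows():
    # scalar index labels for the winner and loser rows
    w_idx = players.index[players['Name'] == row['Winner']][0]
    l_idx = players.index[players['Name'] == row['Looser']][0]
    winner_elo, loser_elo = update_elo(players.at[w_idx, 'elo'],
                                       players.at[l_idx, 'elo'])
    # write the new ratings back by scalar label
    players.at[w_idx, 'elo'] = winner_elo
    players.at[l_idx, 'elo'] = loser_elo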
I've written a python program that takes some inputs and turns them into a matplotlib graph. Specifically, it displays wealth distributions by percentile for a country of the user's choosing. However, these inputs are currently given by changing variables in the program.
I want to put this code on a website, allowing users to choose any country and see the wealth distribution for that country, as well as how they compare. Essentially, I am trying to recreate this: https://wid.world/income-comparator/
The Python code is all done, but I am struggling to incorporate it into an HTML file. I was trying to use PyScript, but it currently loads forever and displays nothing. I would rather not rewrite it in JavaScript (mainly because I don't know JS). My suspicion is that it has something to do with the code importing CSV files from my device.
import csv
from typing import List
import matplotlib.pyplot as plt
import collections
import math
from forex_python.converter import CurrencyRates
# ---------------- #
# whether or not the graph includes the top 1 percent (excluding it makes the rest of the graph visible!)
one_percent = False # True or False
# pick which country(ies) you want to view
country = 'China' # String
# what currency should the graph use
currency_used = 'Canada' # String
# if you want to compare an income
compare_income = True # True or False
# what income do you want to compare
income = 100000 # Int
# ---------------- #
codes = {}
# get dictionary of monetary country codes
monetary_codes = {}
with open('codes-all.csv') as csv_file:
    reader = csv.reader(csv_file, delimiter=',')
    for row in reader:
        if row[5] == "":
            monetary_codes[row[0]] = (row[2], row[1])
# get dictionary of country names and codes for WID
with open('WID_countries.csv') as csv_file:
    WID_codes = csv.reader(csv_file, delimiter=',')
    next(WID_codes)
    for row in WID_codes:
        if len(row[0]) == 2:
            if row[2] != "":
                monetary_code = monetary_codes[row[1].upper()][0]
                currency_name = monetary_codes[row[1].upper()][1]
                codes[row[1].upper()] = (row[0], monetary_code, currency_name)
            elif row[2] == "":
                codes[row[1].upper()] = (row[0], 'USD', 'United States Dollar')
        elif row[0][0] == 'U' and row[0][1] == 'S':
            codes[row[1].upper()] = (row[0], 'USD', 'United States Dollar')
# converts user input to upper case
country = country.upper()
currency_used = currency_used.upper()
# gets conversion rate
c = CurrencyRates()
conversion_rate = c.get_rate(codes[country][1], codes[currency_used][1])
# convert money into correct currency
def convert_money(conversion_rate, value):
    return float(value) * conversion_rate
# get and clean data
def get_data(country):
    aptinc = {}
    # cleaning the data
    with open(f'country_data/WID_data_{codes[country][0]}.csv') as csv_file:
        data = csv.reader(csv_file, delimiter=';')
        for row in data:
            # I only care about the year 2021 and the variable 'aptinc'
            if 'aptinc992' in row[1] and row[3] == '2021':
                # translates the percentile string into a numerical value
                index = 0
                for i in row[2]:
                    # index 0 is always 'p', so we get rid of that
                    if index == 0:
                        row[2] = row[2][1:]
                    # each string has a 'p' in the middle of the numbers we care about. I also only
                    # care about the rows which measure a single percentile
                    # (upper bound - lower bound <= 1)
                    elif i == 'p':
                        lb = float(row[2][:index - 1])
                        ub = float(row[2][index:])
                        # if the top one percent is being filtered out, add another requirement
                        if not one_percent:
                            if ub - lb <= 1 and ub <= 99:
                                row[2] = ub
                            else:
                                row[2] = 0
                        else:
                            if ub - lb <= 1:
                                row[2] = ub
                            else:
                                row[2] = 0
                    index += 1
                # adds wanted, cleaned data to a dictionary. Also converts all values to one currency
                if row[2] != 0:
                    aptinc[row[2]] = convert_money(conversion_rate, row[4])
    return aptinc
# find the closest percentile to an income
def closest_percentile(income, data):
    closest = math.inf
    percentile = float()
    for i in data:
        difference = income - data[i]
        # track the smallest absolute difference seen so far
        if abs(difference) < closest:
            closest = abs(difference)
            percentile = i
    return percentile
# ---------------- #
unsorted_data = {}
percentiles = []
average_income = []
# gets data for the country
data = get_data(country)
for i in data:
    unsorted_data[i] = data[i]
# sorts the data
sorted_data = collections.OrderedDict(sorted(unsorted_data.items()))
for i in sorted_data:
    percentiles.append(i)
    average_income.append(data[i])
# makes countries pretty for printing
country = country.lower()
country = country.capitalize()
# calculates where the income places against incomes from country(ies)
blurb = ""
if compare_income:
    percentile = closest_percentile(income, sorted_data)
    blurb = f"You are richer than {round(percentile)} percent of {country}'s population"
# plot this data!
plt.plot(percentiles, average_income)
plt.title(f'{country} Average Annual Income by Percentile')
plt.xlabel(f'Percentile\n{blurb}')
plt.ylabel(f'Average Annual Income of {country} ({codes[currency_used][1]})')
plt.axvline(x = 99, color = 'r', label = '99th percentile', linestyle=':')
if compare_income:
    plt.axvline(x = percentile, color = 'g', label = f'{income} {codes[currency_used][2]}')
plt.legend(bbox_to_anchor = (0, 1), loc = 'upper left')
plt.show()
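For what it's worth, one plausible cause (an assumption, not something I can verify from here): in the browser, PyScript cannot open files from your disk, so every open('...csv') call will fail or stall. A minimal sketch of fetching the same data over HTTP instead, using Pyodide's open_url; the path is a placeholder for wherever the CSVs end up hosted:
import csv
# Pyodide ships with the PyScript runtime; open_url fetches a URL and
# returns an io.StringIO, which csv.reader accepts directly.
from pyodide.http import open_url

monetary_codes = {}
reader = csv.reader(open_url('codes-all.csv'), delimiter=',')  # placeholder path
for row in reader:
    if row[5] == "":
        monetary_codes[row[0]] = (row[2], row[1])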
So I have a Python script that first aggregates and standardizes data into a file I called the "tripFile". The script then tries to identify the differences between this most recent tripFile and a previous one.
If I export the tripFile from the first part of the code and import it again for the second part, the second part takes around 5 minutes to run and reports looping over a bit more than 4,000 objects.
newTripFile = pd.read_csv(PATH + today + ' Trip File v6.csv')
However, if I do not export and re-import the data (just keeping the dataframe from the first part of the code), it takes a bit less than 24 hours (!!) and reports looping over a bit more than 951,691 objects.
newTripFile = tripFile
My data is a dataframe, and I checked its shape: it is identical to the shape of the file I export.
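One quick check that narrows this kind of discrepancy down is comparing the dtypes as well as the shape; a minimal diagnostic sketch (Series.compare assumes pandas >= 1.1):
# Show only the columns whose dtypes differ between the in-memory
# frame and the re-imported file; an empty result means they match.
import pandas as pd
print(tripFile.dtypes.compare(pd.read_csv(PATH + today + ' Trip File v6.csv').dtypes))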
Any idea what could be causing this?
Here is the second part of my code:
oldTripFile = pd.read_excel(PATH + OLDTRIPFILE)
oldTripFile.drop(['id'], axis = 1, inplace = True)
oldTripFile['status'] = 'old'
# New version of trip file
newTripFile = pd.read_csv(PATH + today + ' Trip File v6.csv')
newTripFile.drop(['id'], axis = 1, inplace = True)
newTripFile['status'] = 'new'
db_trips = pd.concat([oldTripFile, newTripFile]) #concatenation of the two dataframes
db_trips = db_trips.reset_index(drop = True)
db_trips.drop_duplicates(keep = False, subset = [column for column in db_trips.columns[:-1] ], inplace = True)
db_trips = db_trips.reset_index(drop = True)
db_trips.head()
update_details = []
# Get the duplicates: only consider ['fromCode', 'toCode', 'mode'] for identifying duplicates
# Create a dataframe that contains only the trips that were deleted or recently added
db_trips_delete_new = db_trips.drop_duplicates(keep = False, subset = ['fromCode', 'toCode', 'mode'])
db_trips_delete_new = db_trips_delete_new.reset_index(drop = True)
# New trips
new_trips = db_trips_delete_new[db_trips_delete_new['status'] == 'new'].values.tolist()
for trip in new_trips:
    trip.append('new trip added')
update_details = update_details + new_trips
# Deleted trips
old_trips = db_trips_delete_new[db_trips_delete_new['status'] == 'old'].values.tolist()
for trip in old_trips:
    trip.append('trip deleted')
update_details = update_details + old_trips
db_trips_delete_new.head()
# Updated trips
# Ocean: no need to check the transit time column
sea_trips = db_trips.loc[db_trips['mode'].isin(['sea', 'cfs'])]
sea_trips = sea_trips.reset_index(drop = True)
list_trips_sea_update = sea_trips[sea_trips.duplicated(subset = ['fromCode', 'toCode', 'mode'], keep = False)].values.tolist()
if len(list_trips_sea_update) != 0:
    for i in tqdm(range(0, len(list_trips_sea_update) - 1)):
        for j in range(i + 1, len(list_trips_sea_update)):
            if list_trips_sea_update[i][2] == list_trips_sea_update[j][2] and list_trips_sea_update[i][9] == list_trips_sea_update[j][9] and list_trips_sea_update[i][14] == list_trips_sea_update[j][14]:
                update_comment = ''
                # Check display from / to
                if list_trips_sea_update[i][5] != list_trips_sea_update[j][5]:
                    update_comment = update_comment + 'fromDisplayLocation was updated.'
                if list_trips_sea_update[i][12] != list_trips_sea_update[j][12]:
                    update_comment = update_comment + 'toDisplayLocation was updated.'
                # Get the updated trip (the row with status new)
                if list_trips_sea_update[i][17] == 'new' and list_trips_sea_update[j][17] != 'new':
                    list_trips_sea_update[i].append(update_comment)
                    update_details = update_details + [list_trips_sea_update[i]]
                else:
                    if list_trips_sea_update[j][17] == 'new' and list_trips_sea_update[i][17] != 'new':
                        list_trips_sea_update[j].append(update_comment)
                        update_details = update_details + [list_trips_sea_update[j]]
                    else:
                        print('excel files are not organized')
# Ground: the transit time column needs to be checked
ground_trips = db_trips[~db_trips['mode'].isin(['sea', 'cfs'])]
ground_trips = ground_trips.reset_index(drop = True)
list_trips_ground_update = ground_trips[ground_trips.duplicated(subset = ['fromCode', 'toCode', 'mode'], keep = False)].values.tolist()
if len(list_trips_ground_update) != 0:
    for i in tqdm(range(0, len(list_trips_ground_update) - 1)):
        for j in range(i + 1, len(list_trips_ground_update)):
            if list_trips_ground_update[i][2] == list_trips_ground_update[j][2] and list_trips_ground_update[i][9] == list_trips_ground_update[j][9] and list_trips_ground_update[i][14] == list_trips_ground_update[j][14]:
                update_comment = ''
                # Check display from / to
                if list_trips_ground_update[i][5] != list_trips_ground_update[j][5]:
                    update_comment = update_comment + 'fromDisplayLocation was updated.'
                if list_trips_ground_update[i][12] != list_trips_ground_update[j][12]:
                    update_comment = update_comment + 'toDisplayLocation was updated.'
                # Check transit time
                if list_trips_ground_update[i][15] != list_trips_ground_update[j][15]:
                    update_comment = update_comment + 'transit time was updated.'
                # Get the updated trip (the row with status new)
                if list_trips_ground_update[i][17] == 'new' and list_trips_ground_update[j][17] != 'new':
                    list_trips_ground_update[i].append(update_comment)
                    update_details = update_details + [list_trips_ground_update[i]]
                else:
                    if list_trips_ground_update[j][17] == 'new' and list_trips_ground_update[i][17] != 'new':
                        list_trips_ground_update[j].append(update_comment)
                        update_details = update_details + [list_trips_ground_update[j]]
                    else:
                        print('excel files are not organized')
And here is an example of what my trip file looks like:
Any help is appreciated :)
In case it can be useful to someone else: the issue was coming from the column types. When I kept my tripFile in memory, one of my columns held "10.0", for example, whereas in the imported file the same column held "10".
Since I'm comparing against another imported tripFile, when both files are imported the column has the same type in both, but when one file is kept in memory the column types differ and every row is considered updated. That is why the run takes much longer when the dataframe is kept in memory.
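One hedged way to avoid the mismatch without paying the disk round trip is to push the in-memory frame through the same CSV parser via a buffer, or to cast the offending column explicitly (a sketch; 'transitTime' is a hypothetical column name):
import io
import pandas as pd

# Round-trip the in-memory frame through the CSV parser so it gets
# the same dtype inference as the imported file, without touching disk.
buffer = io.StringIO()
tripFile.to_csv(buffer, index=False)
buffer.seek(0)
newTripFile = pd.read_csv(buffer)

# Alternatively, cast the known-problematic column on both sides;
# 'transitTime' is a placeholder for whichever column differed.
# oldTripFile['transitTime'] = oldTripFile['transitTime'].astype(float)
# newTripFile['transitTime'] = newTripFile['transitTime'].astype(float)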
I'm new to Python and SQLAlchemy.
I already have a delete method working if I construct the WHERE conditions by hand.
Now I need to read the columns and values from an incoming request in YAML format and build the WHERE conditions from them.
# enter data as yaml
items:
  - item:
      table: [MyTable, OtherTable]
      filters:
        field_id: 1234
        # other_id: null
Here is what I have tried, but I can't get it to work:
for i in use_case_cfg['items']:
    item = i.get('item')
    for t in item['table']:
        if item['filters']:
            filters = item['filters']
            where_conditions = ''
            count = 0
            for column, value in filters.items():
                aux = str(getattr(t, column) == bindparam(value))
                if count == 0:
                    where_conditions += aux
                else:
                    where_conditions += ', ' + aux
                count += 1
            to_delete = inv[t].__table__.delete().where(text(where_conditions))
            # to_delete = t.__table__.delete().where(getattr(t, column) == value)
        else:
            to_delete = inv[t].__table__.delete()
        CoreData.session.execute(to_delete)
To me it looks OK, but when I run it, I get the error below:
sqlalchemy.exc.StatementError: (sqlalchemy.exc.InvalidRequestError) A value is required for bind parameter '9876'
[SQL: DELETE FROM MyTable WHERE "MyTable".field_id = %(1234)s]
[parameters: [{}]]
(Background on this error at: http://sqlalche.me/e/cd3x)
Can someone explain to me what is wrong or the proper way to do it?
Thanks.
There are two problems with the code.
Firstly,
str(getattr(t, column) == bindparam(value))
uses the value as the bind parameter's name, so you end up with
WHERE f2 = :Bob
but the name should be one that maps to a value in filters (the column name, in your case), so that you end up with
WHERE f2 = :f2
Secondly, multiple WHERE conditions are being joined with a comma, but you should use AND or OR, depending on what you are trying to do.
Given a model Foo:
class Foo(Base):
    __tablename__ = 'foo'
    id = sa.Column(sa.Integer, primary_key=True)
    f1 = sa.Column(sa.Integer)
    f2 = sa.Column(sa.String)
Here's a working version of a segment of your code:
filters = {'f1': 2, 'f2': 'Bob'}
t = Foo
where_conditions = ''
count = 0
for column in filters:
    aux = str(getattr(t, column) == sa.bindparam(column))
    if count == 0:
        where_conditions += aux
    else:
        where_conditions += ' AND ' + aux
    count += 1
to_delete = t.__table__.delete().where(sa.text(where_conditions))
print(to_delete)
session.execute(to_delete, filters)
If you aren't obliged to construct the WHERE conditions as strings, you can do it like this:
where_conditions = [(getattr(t, column) == sa.bindparam(column))
                    for column in filters]
to_delete = t.__table__.delete().where(sa.and_(*where_conditions))
session.execute(to_delete, filters)
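Folded back into the YAML-driven loop from the question, that second approach might look like this sketch (assuming, as in the question's code, that inv maps each table name from the YAML to its mapped class):
import sqlalchemy as sa

for i in use_case_cfg['items']:
    item = i.get('item')
    for t in item['table']:
        model = inv[t]  # assumed: lookup from YAML table name to mapped class
        filters = item.get('filters')
        if filters:
            # one named bind parameter per column, joined with AND
            conditions = [getattr(model, column) == sa.bindparam(column)
                          for column in filters]
            to_delete = model.__table__.delete().where(sa.and_(*conditions))
            CoreData.session.execute(to_delete, filters)
        else:
            CoreData.session.execute(model.__table__.delete())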
I am trying to scrape the "PRINCIPAL STOCKHOLDERS" table from the linked text file and convert it to a CSV file. Right now I am only half successful: I can locate the table and parse it, but somehow I cannot convert the parsed text into a standard table. My code is attached. Can someone help me with it?
import requests
import pandas as pd

url = r'https://www.sec.gov/Archives/edgar/data/1034239/0000950124-97-003372.txt'
# Different approach, the first approach does not work
filing_url = requests.get(url)
content = filing_url.text
splited_data = content.split('\n')
table_title = 'PRINCIPAL STOCKHOLDERS'
END_TABLE_LINE = '- ------------------------'
def find_no_line_start_table(table_title, splited_data):
    found_no_lines = []
    for index, line in enumerate(splited_data):
        if table_title in line:
            found_no_lines.append(index)
    return found_no_lines
table_start = find_no_line_start_table(table_title,splited_data)
# I need help with locating the table. If I locate the table use the above function, it will return two locations and I have to manually choose the correct one.
table_start = table_start[1]
def get_start_data_table(table_start, splited_data):
    for index, row in enumerate(splited_data[table_start:]):
        if '<C>' in row:
            return table_start + index

def get_end_table(start_table_data, splited_data):
    for index, row in enumerate(splited_data[start_table_data:]):
        if END_TABLE_LINE in row:
            return start_table_data + index
def row(l):
    l = l.split()
    number_columns = 8
    if len(l) >= number_columns:
        data_row = [''] * number_columns
        first_column_done = False
        index = 0
        for w in l:
            if not first_column_done:
                data_row[0] = ' '.join([data_row[0], w])
                if ':' in w:
                    first_column_done = True
            else:
                index += 1
                data_row[index] = w
        return data_row
start_line = get_start_data_table(table_start, splited_data)
end_line = get_end_table(start_line, splited_data)
table = splited_data[start_line : end_line]
# I also need help with converting the text table to a CSV file; somehow the following function does not recognize my columns.
def take_table(table):
    owner = []
    Num_share = []
    middle = []
    middle_1 = []
    middle_2 = []
    middle_3 = []
    prior_offering = []
    after_offering = []
    for r in table:
        data_row = row(r)
        if data_row:
            col_1, col_2, col_3, col_4, col_5, col_6, col_7, col_8 = data_row
            owner.append(col_1)
            Num_share.append(col_2)
            middle.append(col_3)
            middle_1.append(col_4)
            middle_2.append(col_5)
            middle_3.append(col_6)
            prior_offering.append(col_7)
            after_offering.append(col_8)
    table_data = {'owner': owner, 'Num_share': Num_share, 'middle': middle, 'middle_1': middle_1,
                  'middle_2': middle_2, 'middle_3': middle_3, 'prior_offering': prior_offering,
                  'after_offering': after_offering}
    return table_data
#print (table)
dict_table = take_table(table)
a = pd.DataFrame(dict_table)
a.to_csv('trail.csv')
I think what you need to do is
pd.DataFrame.from_dict(dict_table)
instead of
pd.DataFrame(dict_table)
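As far as I know, the two constructors behave the same when all the lists have equal lengths. If the failure is "ValueError: All arrays must be of the same length", a hedged workaround is to build the frame row-wise and transpose, which pads short lists with NaN instead of raising:
import pandas as pd

# orient='index' treats each dict key as a row, padding unequal-length
# lists with NaN; transposing restores the keys as column names.
a = pd.DataFrame.from_dict(dict_table, orient='index').transpose()
a.to_csv('trail.csv', index=False)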
First of all, I'm not sure whether it is drop_duplicates()'s fault or not.
What I want to do:
Import a file from CSV and run re.search on every row; if it matches, keep the row in one dictionary, and if it doesn't match, keep the row in another dictionary. Then make a graph out of the lengths of the dictionary values.
The problem
I have 1000 rows in the CSV, but the result returns 1200.
My code
import pandas as pd
import re
# import data
filename = 'sample.csv'
# save data as data
data = pd.read_csv(filename, encoding='utf-8')
# create new dictionary for word that is true and false
# but doesn't have the keyword in items
wordNT = {}
wordNF = {}
kaiT = {}
kaiF = {}
# if text is True
def word_in_text(word, text, label):
    match = re.search(word, text)
    if match and label == True:
        kaiT.setdefault('text', []).append(text)
    elif match and label == False:
        kaiF.setdefault('text', []).append(text)
    elif label == True and not match:
        wordNT.setdefault('text', []).append(text)
    elif label == False and not match:
        wordNF.setdefault('text', []).append(text)
# iterate every text in data
for index, row in data.iterrows():
    word_in_text('foo', row['text'], row['label'])
    word_in_text('bar', row['text'], row['label'])
# make pandas data frame out of dict
wordTDf = pd.DataFrame.from_dict(wordNT)
wordFDf = pd.DataFrame.from_dict(wordNF)
kaiTDf = pd.DataFrame.from_dict(kaiT)
kaiFDf = pd.DataFrame.from_dict(kaiF)
# drop duplicates
wordTDf = wordTDf.drop_duplicates()
wordFDf = wordFDf.drop_duplicates()
kaiTDf = kaiTDf.drop_duplicates()
kaiFDf = kaiFDf.drop_duplicates()
# count how many
wordTrueCount = len(wordTDf.index)
wordFalseCount = len(wordFDf.index)
kaiTrueCount = len(kaiTDf.index)
kaiFalseCount = len(kaiFDf.index)
print(wordTrueCount + wordFalseCount + kaiTrueCount + kaiFalseCount)
When I removed the line
word_in_text('bar', row['text'], row['label'])
and kept only
word_in_text('foo', row['text'], row['label'])
print(wordTrueCount + wordFalseCount + kaiTrueCount + kaiFalseCount) returns 1000 correctly, and vice versa.
But when I keep both calls, it returns 1200 when it should be only 1000.
CSV INPUT sample
text,label
"hey", TRUE
"halo", FALSE
"How are you?", TRUE
EXPECTED OUTPUT
1000
OUTPUT
1200
In the function word_in_text, you update four dicts: wordNT, wordNF, kaiT and kaiF.
And you call word_in_text twice while iterating the dataframe:
# iterate every text in data
for index, row in data.iterrows():
    word_in_text('foo', row['text'], row['label'])
    word_in_text('bar', row['text'], row['label'])
So the search results are a mix of the results for 'foo' and the results for 'bar'.
Instead, you should reset the four dicts before starting a new search:
def search(text):
    # reset the four dicts that word_in_text writes to; they live at
    # module level, so they must be declared global before rebinding
    global wordNT, wordNF, kaiT, kaiF
    wordNT = {}
    wordNF = {}
    kaiT = {}
    kaiF = {}
    # iterate every text in data
    for index, row in data.iterrows():
        word_in_text(text, row['text'], row['label'])
    # make pandas data frames out of the dicts
    wordTDf = pd.DataFrame.from_dict(wordNT)
    wordFDf = pd.DataFrame.from_dict(wordNF)
    kaiTDf = pd.DataFrame.from_dict(kaiT)
    kaiFDf = pd.DataFrame.from_dict(kaiF)
    # drop duplicates
    wordTDf = wordTDf.drop_duplicates()
    wordFDf = wordFDf.drop_duplicates()
    kaiTDf = kaiTDf.drop_duplicates()
    kaiFDf = kaiFDf.drop_duplicates()
    # count how many
    wordTrueCount = len(wordTDf.index)
    wordFalseCount = len(wordFDf.index)
    kaiTrueCount = len(kaiTDf.index)
    kaiFalseCount = len(kaiFDf.index)
    print(wordTrueCount + wordFalseCount + kaiTrueCount + kaiFalseCount)
search('foo')
search('bar')
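A variant worth considering: passing the dictionaries into the helper instead of relying on module-level state sidesteps the reset problem entirely. A sketch along those lines (same matching logic, but the count is returned rather than printed):
import re

def word_in_text(word, text, label, buckets):
    # sort each text into one of four buckets based on match and label
    match = re.search(word, text)
    if match and label:
        buckets['kaiT'].append(text)
    elif match:
        buckets['kaiF'].append(text)
    elif label:
        buckets['wordNT'].append(text)
    else:
        buckets['wordNF'].append(text)

def search(word, data):
    # fresh buckets per search, so results never mix between keywords
    buckets = {'kaiT': [], 'kaiF': [], 'wordNT': [], 'wordNF': []}
    for _, row in data.iterrows():
        word_in_text(word, row['text'], row['label'], buckets)
    # deduplicate each bucket, then sum the counts
    return sum(len(set(texts)) for texts in buckets.values())

print(search('foo', data))  # expected: the number of rows in data
print(search('bar', data))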