I wrote a code to convert PDF to CSV, read the CSV file, and export only relevant information from the CSV file. The function is supposed to return filtered information such as english_name: 'someones name', original_language_name: 'someones name' etc, but instead the command returned english_name: '', original_language_name: '' etc. Below is the code that I wrote:
import pandas as pd
import tabula
from pandas import DataFrame
from backend.classes import Shareholder, Officer
from typing import List
def strip_string(string):
return str(string).strip()
def get_float_without_thousands_separator(string, thousands_separator):
return float(string.replace(thousands_separator, ''))
def extract_officers_and_shareholders_lists_from_df(df, total_number_of_shares, no_data_placeholder, number_of_shares, thousands_separator):
officers = []
shareholders = []
NAME = 'Nama'
POSITION = 'Jabatan'
for row in range((df.shape[0])):
if str(df[POSITION][row]).strip() != no_data_placeholder:
original_language_name = strip_string(df[NAME][row])
english_name = strip_string(df[NAME][row])
position = strip_string(df[POSITION][row])
officer = Officer(english_name=english_name, original_language_name=original_language_name, position=position)
elif str(df[number_of_shares][row]).strip() != no_data_placeholder:
original_language_name = strip_string(df[NAME][row])
english_name = strip_string(df[NAME][row])
number_of_shares_string = strip_string(df[number_of_shares][row])
number_of_shares_number = get_float_without_thousands_separator(number_of_shares_string, thousands_separator)
shareholding_percentage = (number_of_shares_number / total_number_of_shares) * 100
shareholder = Shareholder(english_name=english_name, original_language_name=original_language_name, shareholding_percentage=shareholding_percentage)
return officers, shareholders
def get_officers_and_shareholders_lists(pdf_input_file):
NUMBER_OF_SHARES = 'Jumlah Lembar Saham'
output_file_path = 'CSV/Officers_and_Shareholders.csv'
tabula.convert_into(pdf_input_file, output_file_path, output_format='csv', pages='all')
df = pd.read_csv(output_file_path, header=3, on_bad_lines='skip')
all_shares = df[NUMBER_OF_SHARES].to_list()
all_shares_strings = [strip_string(shares) for shares in all_shares if strip_string(shares) != NO_DATA_PLACEHOLDER]
all_shares_numbers = [get_float_without_thousands_separator(shares, THOUSANDS_SEPARATOR) for shares in all_shares_strings]
total_number_of_shares = sum(all_shares_numbers)
return extract_officers_and_shareholders_lists_from_df(
The command call that I use for the codes on the above is python3 -m backend.officers_and_shareholders. Is there a method to pass in so that english_name returns a name, original_language_name returns a name?
recent_cases is supposed to sum the new covid cases in last 10 days for a given location
somehow my code prints None. i cant find the problem
import json
import pandas as pd
import plotly.express as ex
from datetime import *
from datetime import timedelta
class Covid:
dt = timedelta(days=1)
ten_days = timedelta(days=10)
covid_data = pd.read_excel("owid-covid-data.xlsx", usecols="C:F,H,I")
def recent_cases(self, cntry):
today = datetime.today()
temp = today - self.ten_days # 10 days before today
sum_of_cases = 0
for ind in self.covid_data.index:
if temp <= today:
if (self.covid_data["date"][ind] == temp) and (self.covid_data['location'][ind] == cntry):
# if 'date' is temp and 'location' is the location input, sum new cases
sum_of_cases = sum_of_cases + int(self.covid_data["new_cases"][ind])
temp = temp + self.dt # move to the next day
else: # if temp passed today, all past ten days cases are summed
if __name__ == '__main__':
c = Covid()
I am trying to implement a "user-friendly" portfolio optimization program in Python.
Since I am still a beginner I did not quite manage to realize it.
The only thing the program should use as input are the stock codes.
I tried to create a mwe below:
import numpy as np
import yfinance as yf
import pandas as pd
def daily_returns(price):
price = price.to_numpy()
shift_1 = price[1:]
shift_2 = price[:-1]
return (shift_1 - shift_2)/shift_1
def annual_returns(price):
price = price.to_numpy()
start = price[0]
end = price[len(price)-1]
return (end-start)/start
def adjusting(price):
adj = len(price)
diff = adj - adjvalue
if diff != 0:
price_new = price[:-diff]
else: price_new = price
return price_new
#Minimal Reproducible Example
#getting user input
names = input('Stock codes:')
names = names.split()
a = len(names)
msft = yf.Ticker(names[0])
aapl = yf.Ticker(names[1])
#import data
hist_msft = msft.history(interval='1d',start='2020-01-01',end='2020-12-31')
hist_msft = pd.DataFrame(hist_msft,columns=['Close'])
#hist_msft = hist_msft.to_numpy()
hist_aapl = aapl.history(interval='1d',start='2020-01-01',end='2020-12-31')
hist_aapl = pd.DataFrame(hist_aapl,columns=['Close'])
#hist_aapl = hist_aapl.to_numpy()
#daily returns
aapl_daily_returns = daily_returns(hist_aapl)
aapl_daily_returns = np.ravel(aapl_daily_returns)
msft_daily_returns = daily_returns(hist_msft)
msft_daily_returns = np.ravel(msft_daily_returns)
#adjusting for different trading periods
adjvalue = min(len(aapl_daily_returns),len(msft_daily_returns))
aapl_adj = adjusting(aapl_daily_returns)
msft_adj = adjusting(msft_daily_returns)
#annual returns
aapl_ann_returns = annual_returns(hist_aapl)
msft_ann_returns = annual_returns(hist_msft)
#inputs for optimization
cov_mat = np.cov([aapl_adj,msft_adj])*252
ann_returns = np.concatenate((aapl_ann_returns,msft_ann_returns))
Now I just want the code to work with a various, unknown number of inputs. I tried reading a lot about global variables or tried to figure it out with dictionaries but couldn't really achieve any progress.
I think using the for loop can solve your problem!
names = input('Stock codes:')
names = names.split()
for name in names:
#analyze here
#I don't know anything about stocks so I wont write anything here
As you can see in the title, I want to find the time of given city in Python. How can I achieve this? I've tried geopy and timezonefinder modules but they are giving me different results too. (like 'What time is it in Spotify?', 'It's 12:04')
What I'm trying to achieve is:
What time is it in California?
It's 16:15
import nltk
import datetime
import calendar
import pytz
from geopy.geocoders import Nominatim
from timezonefinder import TimezoneFinder
self.inp = input("City name: ")
# Find city name using NLP
# Get city name
findCityName = str(self.inp.title())
word = nltk.word_tokenize(findCityName)
pos_tag = nltk.pos_tag(word)
chunk = nltk.ne_chunk(pos_tag)
self.inp = [ " ".join(w for w, t in ele) for ele in chunk if isinstance(ele, nltk.Tree)]
self.inp = ' '.join(self.inp)
# Get lat, long from city name
geolocator = Nominatim(user_agent='xxx')
location = geolocator.geocode(self.inp.capitalize())
# Get timezone from coordinates
tf = TimezoneFinder()
latitude, longitude = location.latitude, location.longitude
# Timezone
datez = tf.timezone_at(lng=longitude, lat=latitude)
datez = str(datez)
globalDate = datetime.datetime.now(pytz.timezone(datez))
print("The date in " + str(self.inp) + " is: " + globalDate.strftime('%A, %m/%d/%y'))
I am trying to write a script to generate data. I am using random package for this. I execute the script and everything works fine. But when I check through the results, I found out that the script fails to generate the last 100+ rows for some reason.
Can someone suggest me why this could be happening?
from __future__ import print_function
from faker import Faker;
import random;
## Vaue declaration
population = 3;
product = 3;
years = 3;
months = 13;
days = 30;
tax= 3.5;
## Define Column Header
Column_Names = "Population_ID",";","Product_Name",";","Product_ID",";","Year",";",
## Function to generate sales related information
def sales_data():
for x in range(0,1):
quantity_sold = random.randint(5,20);
discount = random.choice(range(5,11));
sales_price = random.uniform(20,30);
return quantity_sold,round(sales_price,2),discount,round((sales_price)-(sales_price*discount)+(sales_price*tax));
## Format the month to quarter and return the value
def quarter(month):
if month >= 1 and month <= 3:
return "Q1";
elif month > 3 and month <= 6:
return "Q2";
elif month > 6 and month <= 9:
return "Q3";
return "Q4";
## Generate product_id
def product_name():
str2 = "PROD";
sample2 = random.sample([1,2,3,4,5,6,7,8,9],5);
string_list = [];
for x in sample2:
return (str2+''.join(string_list));
### Main starts here ###
result_log = open("C:/Users/Sangamesh.sangamad/Dropbox/Thesis/Data Preparation/GenData.csv",'w')
print (Column_Names, result_log);
### Loop and Generate Data ###
for pop in range(0,population):
pop = random.randint(55000,85000);
for prod_id in range(0,product):
product_name2 = product_name();
for year in range(1,years):
for month in range(1,months):
for day in range(1,31):
a = sales_data();
rows = str(pop)+";"+product_name2+";"+str(prod_id)+";"+str(year)+";"+str(month)+";"+quarter(month)+";"+str(day)+";"+str(a[0])+";"+str(a[1])+";"+str(a[2])+";"+str(tax)+";"+str(a[3]);
#print (rows);
tax = tax+1;
You need to close a file to have the buffers flushed:
Better still, use the file object as a context manager and have the with statement close it for you when the block exits:
filename = "C:/Users/Sangamesh.sangamad/Dropbox/Thesis/Data Preparation/GenData.csv"
with result_log = open(filename, 'w'):
# code writing to result_log
Rather than manually writing strings with delimiters in between, you should really use the csv module:
import csv
# ..
column_names = (
"Population_ID", "Product_Name", "Product_ID", "Year",
"Month", "Day", "Quantity_sold", "Sales_Price", "Discount",
"Actual_Sales_Price", tax)
# ..
with result_log = open(filename, 'wb'):
writer = csv.writer(result_log, delimiter=';')
# looping
row = [pop, product_name2, prod_id, year, month, quarter(month), day,
a[0], a[1], a[2], tax, a[3]]