Sort images based on age - python

I'm using the following code to load the IMDB dataset and mat file then isolate them based on the ages. The following code loads the mat file and then it computes the age by subtracting the data in which the photo was taken from the date of birth
from scipy.io import loadmat
from datetime import datetime
import os
import numpy as np
from shutil import copy
db = "imdb"
mat_path = "data/{}_crop/{}.mat".format(db, db)
print (mat_path)
dataset = loadmat(mat_path)
face_score_treshold = 3
path_root = "data/{}_crop/".format(db)
def calc_age(taken, dob):
birth = datetime.fromordinal(max(int(dob) - 366, 1))
# assume the photo was taken in the middle of the year
if birth.month < 7:
return taken - birth.year
else:
return taken - birth.year - 1
image_names_array = dataset['imdb']['full_path'][0, 0][0]
dob = dataset[db][0, 0]["dob"][0] # Matlab serial date number
photo_taken = dataset[db][0, 0]["photo_taken"][0] # year
image_names = []
age = [calc_age(photo_taken[i], dob[i]) for i in range(len(dob))]
print(age)
for image_name_arg in range(image_names_array.shape[0]):
try:
os.makedirs(str (age[image_name_arg]))
except OSError:
pass
copy(path_root + "/" + str(image_names_array[image_name_arg][0]),str (age[image_name_arg]))
image_name = image_names_array[image_name_arg][0]
image_names.append(image_name)
print( dict(zip(image_names, age)))
After calculating the age, it creates the folder with age as the folder name and it copies the image files to the folder.
However there are lots of false negatives in the resulting folder. How can I be able to sort this out?

Related

Blank Strings Are Returned in Python Dataframe

I wrote a code to convert PDF to CSV, read the CSV file, and export only relevant information from the CSV file. The function is supposed to return filtered information such as english_name: 'someones name', original_language_name: 'someones name' etc, but instead the command returned english_name: '', original_language_name: '' etc. Below is the code that I wrote:
import pandas as pd
import tabula
from pandas import DataFrame
from backend.classes import Shareholder, Officer
from typing import List
def strip_string(string):
return str(string).strip()
def get_float_without_thousands_separator(string, thousands_separator):
return float(string.replace(thousands_separator, ''))
def extract_officers_and_shareholders_lists_from_df(df, total_number_of_shares, no_data_placeholder, number_of_shares, thousands_separator):
officers = []
shareholders = []
NAME = 'Nama'
POSITION = 'Jabatan'
for row in range((df.shape[0])):
if str(df[POSITION][row]).strip() != no_data_placeholder:
original_language_name = strip_string(df[NAME][row])
english_name = strip_string(df[NAME][row])
position = strip_string(df[POSITION][row])
officer = Officer(english_name=english_name, original_language_name=original_language_name, position=position)
officers.append(officer)
elif str(df[number_of_shares][row]).strip() != no_data_placeholder:
original_language_name = strip_string(df[NAME][row])
english_name = strip_string(df[NAME][row])
number_of_shares_string = strip_string(df[number_of_shares][row])
number_of_shares_number = get_float_without_thousands_separator(number_of_shares_string, thousands_separator)
shareholding_percentage = (number_of_shares_number / total_number_of_shares) * 100
shareholder = Shareholder(english_name=english_name, original_language_name=original_language_name, shareholding_percentage=shareholding_percentage)
shareholders.append(shareholder)
return officers, shareholders
def get_officers_and_shareholders_lists(pdf_input_file):
NO_DATA_PLACEHOLDER = '-'
NUMBER_OF_SHARES = 'Jumlah Lembar Saham'
THOUSANDS_SEPARATOR = '.'
output_file_path = 'CSV/Officers_and_Shareholders.csv'
tabula.convert_into(pdf_input_file, output_file_path, output_format='csv', pages='all')
df = pd.read_csv(output_file_path, header=3, on_bad_lines='skip')
all_shares = df[NUMBER_OF_SHARES].to_list()
all_shares_strings = [strip_string(shares) for shares in all_shares if strip_string(shares) != NO_DATA_PLACEHOLDER]
all_shares_numbers = [get_float_without_thousands_separator(shares, THOUSANDS_SEPARATOR) for shares in all_shares_strings]
total_number_of_shares = sum(all_shares_numbers)
return extract_officers_and_shareholders_lists_from_df(
df=df,
total_number_of_shares=total_number_of_shares,
number_of_shares=NUMBER_OF_SHARES,
no_data_placeholder=NO_DATA_PLACEHOLDER,
thousands_separator=THOUSANDS_SEPARATOR)
The command call that I use for the codes on the above is python3 -m backend.officers_and_shareholders. Is there a method to pass in so that english_name returns a name, original_language_name returns a name?

xml with pandas: sum values by condition

recent_cases is supposed to sum the new covid cases in last 10 days for a given location
somehow my code prints None. i cant find the problem
import json
import pandas as pd
import plotly.express as ex
from datetime import *
from datetime import timedelta
class Covid:
dt = timedelta(days=1)
ten_days = timedelta(days=10)
covid_data = pd.read_excel("owid-covid-data.xlsx", usecols="C:F,H,I")
def recent_cases(self, cntry):
today = datetime.today()
temp = today - self.ten_days # 10 days before today
sum_of_cases = 0
for ind in self.covid_data.index:
if temp <= today:
if (self.covid_data["date"][ind] == temp) and (self.covid_data['location'][ind] == cntry):
# if 'date' is temp and 'location' is the location input, sum new cases
sum_of_cases = sum_of_cases + int(self.covid_data["new_cases"][ind])
temp = temp + self.dt # move to the next day
else: # if temp passed today, all past ten days cases are summed
break
if __name__ == '__main__':
c = Covid()
print(c.recent_cases('Italy'))

Using user input as variables in Python

I am trying to implement a "user-friendly" portfolio optimization program in Python.
Since I am still a beginner I did not quite manage to realize it.
The only thing the program should use as input are the stock codes.
I tried to create a mwe below:
import numpy as np
import yfinance as yf
import pandas as pd
def daily_returns(price):
price = price.to_numpy()
shift_1 = price[1:]
shift_2 = price[:-1]
return (shift_1 - shift_2)/shift_1
def annual_returns(price):
price = price.to_numpy()
start = price[0]
end = price[len(price)-1]
return (end-start)/start
def adjusting(price):
adj = len(price)
diff = adj - adjvalue
if diff != 0:
price_new = price[:-diff]
else: price_new = price
return price_new
#Minimal Reproducible Example
#getting user input
names = input('Stock codes:')
names = names.split()
a = len(names)
msft = yf.Ticker(names[0])
aapl = yf.Ticker(names[1])
#import data
hist_msft = msft.history(interval='1d',start='2020-01-01',end='2020-12-31')
hist_msft = pd.DataFrame(hist_msft,columns=['Close'])
#hist_msft = hist_msft.to_numpy()
hist_aapl = aapl.history(interval='1d',start='2020-01-01',end='2020-12-31')
hist_aapl = pd.DataFrame(hist_aapl,columns=['Close'])
#hist_aapl = hist_aapl.to_numpy()
#daily returns
aapl_daily_returns = daily_returns(hist_aapl)
aapl_daily_returns = np.ravel(aapl_daily_returns)
msft_daily_returns = daily_returns(hist_msft)
msft_daily_returns = np.ravel(msft_daily_returns)
#adjusting for different trading periods
adjvalue = min(len(aapl_daily_returns),len(msft_daily_returns))
aapl_adj = adjusting(aapl_daily_returns)
msft_adj = adjusting(msft_daily_returns)
#annual returns
aapl_ann_returns = annual_returns(hist_aapl)
msft_ann_returns = annual_returns(hist_msft)
#inputs for optimization
cov_mat = np.cov([aapl_adj,msft_adj])*252
ann_returns = np.concatenate((aapl_ann_returns,msft_ann_returns))
Now I just want the code to work with a various, unknown number of inputs. I tried reading a lot about global variables or tried to figure it out with dictionaries but couldn't really achieve any progress.
I think using the for loop can solve your problem!
...
names = input('Stock codes:')
names = names.split()
for name in names:
#analyze here
#I don't know anything about stocks so I wont write anything here
...

Get time from city name using Python

As you can see in the title, I want to find the time of given city in Python. How can I achieve this? I've tried geopy and timezonefinder modules but they are giving me different results too. (like 'What time is it in Spotify?', 'It's 12:04')
What I'm trying to achieve is:
What time is it in California?
It's 16:15
THE CODE
import nltk
import datetime
import calendar
import pytz
from geopy.geocoders import Nominatim
from timezonefinder import TimezoneFinder
self.inp = input("City name: ")
# Find city name using NLP
# Get city name
findCityName = str(self.inp.title())
# NLP
word = nltk.word_tokenize(findCityName)
pos_tag = nltk.pos_tag(word)
chunk = nltk.ne_chunk(pos_tag)
self.inp = [ " ".join(w for w, t in ele) for ele in chunk if isinstance(ele, nltk.Tree)]
self.inp = ' '.join(self.inp)
# Get lat, long from city name
geolocator = Nominatim(user_agent='xxx')
location = geolocator.geocode(self.inp.capitalize())
# Get timezone from coordinates
tf = TimezoneFinder()
latitude, longitude = location.latitude, location.longitude
# Timezone
datez = tf.timezone_at(lng=longitude, lat=latitude)
datez = str(datez)
globalDate = datetime.datetime.now(pytz.timezone(datez))
print("The date in " + str(self.inp) + " is: " + globalDate.strftime('%A, %m/%d/%y'))

Data generation incomplete: Python random

I am trying to write a script to generate data. I am using random package for this. I execute the script and everything works fine. But when I check through the results, I found out that the script fails to generate the last 100+ rows for some reason.
Can someone suggest me why this could be happening?
from __future__ import print_function
from faker import Faker;
import random;
## Vaue declaration
population = 3;
product = 3;
years = 3;
months = 13;
days = 30;
tax= 3.5;
## Define Column Header
Column_Names = "Population_ID",";","Product_Name",";","Product_ID",";","Year",";",
"Month",";","Day","Quantity_sold",";","Sales_Price",";","Discount",
";","Actual_Sales_Price",tax;
## Function to generate sales related information
def sales_data():
for x in range(0,1):
quantity_sold = random.randint(5,20);
discount = random.choice(range(5,11));
sales_price = random.uniform(20,30);
return quantity_sold,round(sales_price,2),discount,round((sales_price)-(sales_price*discount)+(sales_price*tax));
## Format the month to quarter and return the value
def quarter(month):
if month >= 1 and month <= 3:
return "Q1";
elif month > 3 and month <= 6:
return "Q2";
elif month > 6 and month <= 9:
return "Q3";
else:
return "Q4";
## Generate product_id
def product_name():
str2 = "PROD";
sample2 = random.sample([1,2,3,4,5,6,7,8,9],5);
string_list = [];
for x in sample2:
string_list.append(str(x));
return (str2+''.join(string_list));
### Main starts here ###
result_log = open("C:/Users/Sangamesh.sangamad/Dropbox/Thesis/Data Preparation/GenData.csv",'w')
print (Column_Names, result_log);
### Loop and Generate Data ###
for pop in range(0,population):
pop = random.randint(55000,85000);
for prod_id in range(0,product):
product_name2 = product_name();
for year in range(1,years):
for month in range(1,months):
for day in range(1,31):
a = sales_data();
rows = str(pop)+";"+product_name2+";"+str(prod_id)+";"+str(year)+";"+str(month)+";"+quarter(month)+";"+str(day)+";"+str(a[0])+";"+str(a[1])+";"+str(a[2])+";"+str(tax)+";"+str(a[3]);
print(rows,file=result_log);
#print (rows);
tax = tax+1;
You need to close a file to have the buffers flushed:
result_log.close()
Better still, use the file object as a context manager and have the with statement close it for you when the block exits:
filename = "C:/Users/Sangamesh.sangamad/Dropbox/Thesis/Data Preparation/GenData.csv"
with result_log = open(filename, 'w'):
# code writing to result_log
Rather than manually writing strings with delimiters in between, you should really use the csv module:
import csv
# ..
column_names = (
"Population_ID", "Product_Name", "Product_ID", "Year",
"Month", "Day", "Quantity_sold", "Sales_Price", "Discount",
"Actual_Sales_Price", tax)
# ..
with result_log = open(filename, 'wb'):
writer = csv.writer(result_log, delimiter=';')
writer.writerow(column_names)
# looping
row = [pop, product_name2, prod_id, year, month, quarter(month), day,
a[0], a[1], a[2], tax, a[3]]
writer.writerow(row)

Categories