How to compare an installation path with a wildcarded application name in Python

I want to traverse a dataframe in Python and strip the full installation path, so that only the application name, e.g. firefox.exe or firefox, remains instead of "C:\Program Files\Mozilla Firefox\firefox.exe".
I created a function which takes the respective dataframe and a dictionary containing key/value pairs with an application name as key (firefox) and the wildcarded path as value (*firefox.exe).
def transform_process_name(mid_result_df, name_dict):
    for row, col in mid_result_df.iterrows():
        for name, value in name_dict.items():
            print(name)
            print(value)
            if col['ProcessName'] == value:
                col['ProcessName'] = name
    return mid_result_df
This is called in a function as follows:
transform_process_name(mid_result_df, __name_of_processes)
where
__name_of_processes = {
    'firefox': '*firefox.exe',
}
The comparison does not work, so the output is still "C:\Program Files\Mozilla Firefox\firefox.exe" and not "firefox".
Thank you :)
EDIT:
It now works with
def transform_process_name(mid_result_df, name_dict):
    for row, col in mid_result_df.iterrows():
        for name, value in name_dict.items():
            string_check = col['ProcessName']
            if string_check.endswith(value):
                mid_result_df.at[row, 'ProcessName'] = name
    return mid_result_df
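A vectorized variant avoids iterrows entirely by masking the whole column per pattern — a minimal sketch, assuming the same DataFrame layout (lstrip('*') turns the wildcarded values into plain suffixes):

import pandas as pd

def transform_process_name(mid_result_df, name_dict):
    for name, value in name_dict.items():
        # '*firefox.exe' -> 'firefox.exe', so it works as a plain suffix
        suffix = value.lstrip('*')
        mask = mid_result_df['ProcessName'].str.endswith(suffix)
        mid_result_df.loc[mask, 'ProcessName'] = name
    return mid_result_df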

I don't think you can use == to compare "C:\Program Files\Mozilla Firefox\firefox.exe" and "*firefox.exe". I think you probably want to replace
if col['ProcessName'] == value:
with
if value in col['ProcessName']:
or
if col['ProcessName'].endswith(value):
and value should be 'firefox.exe'.
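Alternatively, if you want to keep the wildcard values ('*firefox.exe') exactly as written, the standard library's fnmatch module understands shell-style wildcards — a minimal sketch:

from fnmatch import fnmatch

# fnmatch matches shell-style patterns, so the leading '*' covers
# the whole directory prefix.
print(fnmatch(r"C:\Program Files\Mozilla Firefox\firefox.exe", '*firefox.exe'))  # True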
EDIT:
This is my code:
import pandas as pd

def transform_process_name(mid_result_df, name_dict):
    for row, col in mid_result_df.iterrows():
        for name, value in name_dict.items():
            # alternative check: if col['ProcessName'].endswith(value):
            if value in col['ProcessName']:
                # write through .at: assigning into the row Series that
                # iterrows() yields is not guaranteed to reach the DataFrame
                mid_result_df.at[row, 'ProcessName'] = name
    return mid_result_df
mid_result_df = pd.DataFrame({'ProcessName': ['C:\\Program Files\\Mozilla Firefox\\firefox.exe']})
name_dict = {'firefox': 'firefox.exe'}
result = transform_process_name(mid_result_df, name_dict)
print(result)

Maybe I did not understand the problem well, but if you have a dataframe containing the paths to each application, why don't you use the native functions in the os module to extract the application names themselves?
For example:
import pandas as pd
import os

def get_application(path):
    return os.path.splitext(os.path.basename(path))[0]

path = "C:/Program Files/Mozilla Firefox/"
df = pd.DataFrame([os.path.join(path, "firefox.exe"), os.path.join(path, "myapp.exe")],
                  columns=["full_path"])
df["application"] = df["full_path"].apply(get_application)
print(df)
Here, I created a dummy dataframe with paths to the applications. Then I define a function that, given a path, returns only the app name (basename) without the extension (splitext).
The apply method applies the function to each element of the dataframe, and the output is stored in a new column.
Result:
                                      full_path application
0  C:/Program Files/Mozilla Firefox/firefox.exe     firefox
1    C:/Program Files/Mozilla Firefox/myapp.exe       myapp
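For reference, pathlib gives the same result in one expression — a minimal sketch:

from pathlib import PureWindowsPath

# .stem is the base name without its extension; PureWindowsPath also
# accepts backslashed Windows paths on any platform.
print(PureWindowsPath(r"C:\Program Files\Mozilla Firefox\firefox.exe").stem)  # firefox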

Related

Call function and parameters based on condition in Python

I have written a function in Python that reads either a .csv or .xls file and returns it as a pandas dataframe.
Based on the passed file_type, the function uses either the pandas.read_csv() or pandas.read_excel() function, with (just one slight) difference in the parameters.
It works without an issue, but it's obviously repeated code that I would like to reduce.
So how could I best:
Have just one function call that is dynamically changed to the specific one defined by the file_type variable?
Dynamically change the parameters of the then-called function based on the same variable?
Here is my current code.
Thanks for your help.
import numpy as np
import pandas as pd

def file_to_df(file_name, fields=None, file_type=None, encoding=None):
    """Read stock level from csv or xlsx file. Filter SKU and Qty. Return dataframe."""
    if file_type in ('csv', 'xls'):
        if file_type == 'csv':
            data_frame = pd.read_csv(
                file_name,
                encoding=encoding,
                converters={'Barcode': str, 'Qty': int},
                usecols=fields
            )
        elif file_type == 'xls':
            data_frame = pd.read_excel(
                file_name,
                converters={'Barcode': str, 'Qty': int},
                usecols=fields
            )
        # Remove empty rows
        data_frame.replace('', np.nan, inplace=True)
        data_frame.dropna(axis=0, how='any', subset=None, inplace=True)
        return data_frame
    else:
        print('no csv or xls filetype was handed to file_to_df')
For the parameters I tried using two tuples that are put into the function call.
You can modify your function signature to take **kwargs (see also keyword-only arguments, PEP 3102). Then create a dict of parameters, add your fixed parameters (converters), rename fields to usecols, and pass all other parameters through as-is:
import pandas as pd
import pathlib

def file_to_df(file_name, **kwargs):
    xfile = pathlib.Path(file_name)
    params = {
        'converters': {'Barcode': str, 'Qty': int},  # add fixed parameters
        'usecols': kwargs.pop('fields', None)        # convert fields to usecols
    } | kwargs  # pass all other parameters as-is
    # determine the right function according to the extension
    funcs = {'.csv': pd.read_csv, '.xlsx': pd.read_excel}
    try:
        df = funcs[xfile.suffix](xfile, **params)
    except KeyError:
        raise RuntimeError('no csv or xls filetype was handed to file_to_df')
    return df
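A hypothetical call, assuming a stock file with Barcode and Qty columns (the file name is made up):

# 'fields' is popped and renamed to 'usecols'; 'encoding' flows through to read_csv
df = file_to_df('stock_level.csv', fields=['Barcode', 'Qty'], encoding='utf-8')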
Don't pass a string that has to be mapped to a particular function; just pass the correct function.
import numpy as np
import pandas as pd

def file_to_df(file_name, fields=None, *, converter, **kwargs):
    """Read stock level from csv or xlsx file. Filter SKU and Qty. Return dataframe."""
    data_frame = converter(file_name, converters={'Barcode': str, 'Qty': int}, usecols=fields, **kwargs)
    data_frame.replace('', np.nan, inplace=True)
    data_frame.dropna(axis=0, how='any', subset=None, inplace=True)
    return data_frame

df1 = file_to_df('foo.csv', converter=pd.read_csv)
df2 = file_to_df('foo.xlsx', converter=pd.read_excel, sheet_name=0)

XLOOKUP with pandas in Python

I am very new to Python and I would like to use XLOOKUP-style logic to look for values in different columns (column "Debt", column "Liquidity", etc.) in a database,
and fill the values into the cells (C17, C18, C19, ...) of a number of destination files which have the same format.
import pandas as pd

path_source = r"C:\Test source.xlsx"
destination_file = r"C:\Stress Test Q4 2022\test.xlsx"
df1 = pd.read_excel(path_source)
df2 = pd.read_excel(destination_file)

def xlookup(lookup_value, lookup_array, return_array, if_not_found: str = ''):
    match_value = return_array.loc[lookup_array == lookup_value]
    if match_value.empty:
        return f'"{lookup_value}" not found!' if if_not_found == '' else if_not_found
    else:
        return match_value.tolist()[0]

df2.iloc[2, 17] = df1["debt"].apply(xlookup, args=(main_df1["Fund name"], main_df1["fund_A"]))
NameError: name 'main_df1' is not defined
Can anyone help correct the code?
Thanks a lot.
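For what it's worth, the NameError just means main_df1 was never defined; the frame read above is called df1. A hedged sketch of what a single-cell fill might look like, assuming 'Fund name' and 'debt' are columns of df1 and 'fund_A' is the value being looked up (all hypothetical readings of the snippet):

# Hypothetical: look 'fund_A' up in df1['Fund name'] and write the
# matching df1['debt'] value into one destination cell.
df2.iloc[2, 17] = xlookup('fund_A', df1['Fund name'], df1['debt'])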

Class that returns a transformed dataframe

I'm trying to create a class that takes the path and name of a CSV file, converts it to a dataframe, deletes some columns, and converts another one to datetime, as in the code:
import os
from pathlib import Path
import pandas as pd
import datetime

class Plans:
    def __init__(self, file, path):
        self.file = file
        self.path = path
        self.df = pd.Dataframe()

    def get_dataframe(self):
        os.chdir(self.path)
        self.df = pd.read_csv(self.file, encoding="latin-1", low_memory=False, sep=';')
        if 'data' in df.columns:
            self.tipo = 'sales'
            self.df['data'] = pd.to_datetime(df['data'])
        return clean_unused_data()

    def clean_unused_data(self):
        columns = ['id', 'docs', 'sequence', 'data_in', 'received', 'banc', 'return', 'status', 'return_cod',
                   'bank_account_return', 'id_transcript', 'id_tx', 'type_order']
        for item in columns:
            del self.df[item]
        del columns[:]
        return self.df
When I call an object of the class, it gives an error in the clean_unused_data function and
returns the following error:
__getattr__ raise AttributeError(f"module 'pandas' has no attribute '{name}'")
Also, I would like to do more dataframe transformations in the Plans class, but since this first one failed, I was a little lost.
Thanks for the help, and I apologize for my lack of familiarity with Python.
I think the error refers to calling an attribute that does not exist in Pandas. From what I can see you wrote pd.DataFrame as pd.Dataframe. Notice the capitalization.
Try the following:
def __init__(self, file, path):
    self.file = file
    self.path = path
    self.df = pd.DataFrame()
Probably one of the columns you are trying to delete is not actually in your file. You can handle the exception or remove this column label from your array.
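For example, DataFrame.drop accepts errors='ignore', which skips labels that are missing — a minimal sketch of the cleanup step:

def clean_unused_data(self):
    columns = ['id', 'docs', 'sequence', 'data_in', 'received', 'banc', 'return',
               'status', 'return_cod', 'bank_account_return', 'id_transcript',
               'id_tx', 'type_order']
    # drop() with errors='ignore' skips any column not present in the file
    self.df = self.df.drop(columns=columns, errors='ignore')
    return self.df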

Make directory tree of sys.path from dict and list

I am trying to understand sys.path.
So I want to write code that returns a directory tree like the one below, but I can't.
Can someone please tell me the code?
【sys.path】
['C:\\Users\\81802\\PycharmProjects\\PlayGround',
'C:\\Users\\81802\\AppData\\Local\\Programs\\Python\\Python37\\python37.zip',
'C:\\Users\\81802\\AppData\\Local\\Programs\\Python\\Python37\\DLLs',
'C:\\Users\\81802\\AppData\\Local\\Programs\\Python\\Python37\\lib',
'C:\\Users\\81802\\AppData\\Local\\Programs\\Python\\Python37',
'C:\\Users\\81802\\PycharmProjects\\PlayGround\\venv',
'C:\\Users\\81802\\PycharmProjects\\PlayGround\\venv\\lib\\site-packages',
'C:\\Users\\81802\\PycharmProjects\\PlayGround\\venv\\lib\\site-packages\\setuptools-39.1.0-py3.7.egg',
'C:\\Users\\81802\\PycharmProjects\\PlayGround\\venv\\lib\\site-packages\\pip-10.0.1-py3.7.egg']
【directory tree(dict)】
{'C:\\Users\\81802\\':
    [{'PycharmProjects\\PlayGround\\':
        ['',
         {'venv\\':
            ['',
             {'lib\\site-packages\\':
                ['',
                 'setuptools-39.1.0-py3.7.egg',
                 'pip-10.0.1-py3.7.egg']}]}]},
     {'AppData\\Local\\Programs\\Python\\Python37\\':
        ['',
         'python37.zip',
         'DLLs',
         'lib']}]}
This is the simplest I could get. The idea is to maintain a set of paths that have not diverged yet.
import sys
from pprint import pprint

pprint(sys.path)
sep = '\\'

# check if all paths agree on the current name
def isSameName(paths, index):
    for path in paths:
        if index >= len(path) or path[index] != paths[0][index]:
            return False
    return True

# transform the current set of paths into a tree
def toTree(paths, startIndex):
    index = startIndex
    if len(paths) == 1:
        return sep.join(paths[0][index:])
    while isSameName(paths, index):
        index += 1
    nameMap = dict()
    for path in paths:
        name = path[index] if len(path) > index else 0
        if name not in nameMap:
            nameMap[name] = []
        nameMap[name].append(path)
    res = [toTree(paths, index) for paths in nameMap.values()]
    return {sep.join(paths[0][startIndex:index]): res}

paths = [path.split(sep) for path in sys.path]
pprint(toTree(paths, 0))
This will give you a dictionary where every key is a directory, and the values are lists of either file names or dictionaries with a subdirectory.
import os

def get_files_dict(startpath):
    tree = []  # this is the array of subdirectory objects
    for item in os.listdir(startpath):
        # we need to have a full path to the item in the directory
        item_path = os.path.join(startpath, item)
        if os.path.isfile(item_path):
            tree.append(item)
        else:
            # we call this function recursively for subdirectories
            tree.append(get_files_dict(item_path))
    return {os.path.basename(startpath): tree}

file_tree = get_files_dict(os.getcwd())

# this is just a helper function to print the tree nicely
def print_tree(d, i=0):
    for k, v in d.items():
        print("{}{}".format(" "*4*i, k+os.sep))
        for l in v:
            if type(l) is dict:
                print_tree(l, i+1)
            else:
                print("{}{}".format(" "*4*(i+1), l))

print_tree(file_tree)
And the printed output:
runner/
    .bashrc
    .bash_logout
    .profile
    .site-packages/
        main.py
    .config/
        pycodestyle
    _test_runner.py
This was inspired by this SO issue, but I changed quite a bit about the implementation.

Sort strings by year, month and day

I'm quite new to Python.
Basic problem:
Extract a sorted list of file names by year, month and day from a directory.
I'm writing a script that searches for the latest .bak file in a directory and imports it into a database.
Files have the following format:
MyDB_05-09-2017.bak
MyDB_05-10-2017.bak
I wish to extract the most up-to-date .bak file by its file name.
I want to sort the files by year, month and day.
This is some basic implementation I have tried:
import glob, os, re
from datetime import datetime

# SQL server backup directory
os.chdir('C:\\SQLServerBackups')

# invoke the sql script to drop the database if exists
os.system('C:\\SQLServerBackups\\database_sql_scripts\\drop_database.bat')

# find the latest .bak file and rename it to target
file_list = glob.glob('*.bak')
latest_bak_file = file_list[0]
latest_year = 0
latest_month = 0
latest_day = 0

for file in file_list:
    print(file)
    del_list = re.split('[-._]', file)
    temp_latest_year = int(del_list[3])
    temp_latest_month = int(del_list[1])
    temp_latest_day = int(del_list[2])
    if temp_latest_year > latest_year:
        latest_year = temp_latest_year
        latest_month = temp_latest_month
        latest_day = temp_latest_day
    elif temp_latest_year == latest_year:
        if temp_latest_month > latest_month:
            latest_month = temp_latest_month
            latest_day = temp_latest_day
        elif temp_latest_month == latest_month:
            if temp_latest_day > latest_day:
                latest_day = temp_latest_day
                latest_bak_file = file

print(latest_bak_file)
Any advice on how I can implement it better?
I wish to have a sorted list of file names by year, month and day.
You can just define a sort key function that returns the fields you want to sort by:
import re

fnames = [
    "MyDB_05-10-2017.bak",
    "MyDB_05-09-2017.bak",
]

def sortkey(x):
    parts = re.split('[-._]', x)
    return [int(parts[3]), int(parts[1]), int(parts[2])]

sorted_fnames = sorted(fnames, key=sortkey)
or, as @Klaus D said, use datetime.strptime in your sort key:
import datetime

sorted_fnames = sorted(fnames, key=lambda x: datetime.datetime.strptime(x, 'MyDB_%d-%m-%Y.bak'))
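Since the end goal is only the most recent file, max with the same key avoids sorting the whole list — a minimal sketch:

import datetime

# max() with the same parsing key returns the newest file directly
latest = max(fnames, key=lambda x: datetime.datetime.strptime(x, 'MyDB_%d-%m-%Y.bak'))
print(latest)  # MyDB_05-10-2017.bak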
