Create a directory inside today's (datetime-named) directory - Python

I'm new to Python and also to Stack Overflow, even though I sometimes read it for various issues...
These days I'm getting into Python 3 because I would like to improve a simple procedure: every day I need to download data from an Ubuntu system to Windows 10 and create the corresponding path, but this part already happens and works fine.
I just want to add a function to this program (made by other people) to create a folder inside the folder that is created and named with today's date (so every day this folder has a different name!).
import paramiko
import stat
import os
import pandas as pd
#from tqdm.notebook import tqdm
from tqdm import tqdm
import logging
from logging_config import getJupyterHandler
logging.basicConfig(level=logging.INFO, handlers=[getJupyterHandler()])
logging.getLogger('paramiko.transport').setLevel(logging.WARNING)
logging.getLogger('paramiko.transport.sftp').setLevel(logging.WARNING)
import datetime
import utils
root = '/...path'
date = None
overwrite_existing = False
if date is None:
    date = datetime.date.today().strftime("%Y%m%d")
    logging.info("Checking data for today")
else:
    logging.warning("Manual date is set")

sftp = utils.getPandoraSFTP(ip='...')
dates = sftp.listdir(root)

if date not in dates:
    logging.error("No folder found")
else:
    logging.info("Folder found")
    files, numberOfFiles = utils.getFileListToCopy(sftp, f"{root}/{date}",
                                                   f"C:\\data\\to_upload\\{date}",
                                                   f"C:\\data\\to_brighter\\{date}", True)

    logging.info("Download tags")
    tags = {k: v for k, v in files.items() if 'tag/' in k}
    if len(tags) > 0:
        for remote, local in tags.items():
            if os.path.exists(local) == False or overwrite_existing:
                sftp.get(remote, local)

        logging.info("Create summary table")
        folder = os.path.dirname(list(tags.values())[0])
        df = pd.DataFrame(columns=['id', 'Time', 'type', 'Function', 'Leveling', 'Weather',
                                   'Illumination', 'visibility', 'Road Type', 'sky', 'other',
                                   'Issue description', 'driver', 'tester'])
        for file in os.listdir(folder):
            if file.lower().endswith('csv'):
                try:
                    df = df.append(pd.read_csv(f"{folder}\\{file}", header=None,
                                               names=df.columns))
                except Exception as e:
                    logging.error(f"Unable to process tag: {file} due to {e}")

        df['Location'] = ''
        filename = folder.split('\\')[-2]
        summary_path = f'...'
        df[['Time', 'Function', 'Road Type', 'Illumination', 'Weather', 'Location',
            'Issue description']].to_excel(summary_path, index=False, header=False)
        logging.info("Table created on ")
    else:
        logging.warning(f"No tags found")

    pbar = tqdm(files.items())
    for remote, local in pbar:
        pbar.set_postfix_str(os.path.basename(remote))
        if os.path.exists(local) == False or overwrite_existing:
            sftp.get(remote, local)

#making new folder
folder_n = "name of folder..."
os.chdir("C:\\data\\to_upload\\{date}")  #choose directory
os.mkdir(folder_n)
As you can see in the last lines (#making new folder), I simply added a mkdir call to create the folder inside the {date} folder. Of course the error says that it cannot find that path!
Could someone help me and suggest a way to identify that folder?
Thanks

You wrote
os.chdir("C:\\data\\to_upload\\{date}")
but you wanted
os.chdir(f"C:\\data\\to_upload\\{date}")
That is, you want an f-string to interpolate the date variable
so its value will become part of the string.
You'd be better off phrasing it as a raw r-string
os.chdir(rf"C:\data\to_upload\{date}")
rather than doubling the \ backwhacks.
Better still, just use regular / slashes:
os.chdir(f"C:/data/to_upload/{date}")

Thank you very much, it works!!
So I missed that "f"! I was sure paths had to be written just inside plain quotes, but I will study this aspect in more depth.

Related

Python: always import the last revision in the directory

Imagine that we have the following database structure, with the data stored in Python files ready to be imported:
data_base/
    foo_data/
        rev_1.py
        rev_2.py
    bar_data/
        rev_1.py
        rev_2.py
        rev_3.py
In my main script, I would like to import the last revision of the data available in the folder. For example, instead of doing this:
from data_base.foo_data.rev_2 import foofoo
from data_base.bar_data.rev_3 import barbar
I want to call a method:
import_from_db(path='data_base.foo_data', attr='foofoo', rev='last')
import_from_db(path='data_base.bar_data', attr='barbar', rev='last')
I could take a relative path to the database and use glob.glob to search for the last revision, but for this I would need to know the path to the data_base folder, which complicates things (imagine that the parent folder of data_base is in sys.path, so the from data_base.*** import works).
Is there an efficient way to maybe retrieve a full path knowing only part of it (data_base.foo_data)? Other ideas?
I think it's better to just install the latest version, but going with your flow, you can use getattr on the module:
from data_base import foo_data

# Assumes the rev_* modules are available as attributes of foo_data
# (e.g. imported in its __init__.py) and that revisions start at rev_1.
i = 1
your_module = None
while True:
    try:
        your_module = getattr(foo_data, f'rev_{i}')
    except AttributeError:
        break
    i += 1
# Now your_module is the latest rev
@JohnDoriaN's idea led me to a quite simple solution:
import os, glob

def import_from_db(import_path, attr, rev_id=None):
    """
    """
    # Get all the module/folder names
    dir_list = import_path.split('.')
    # Import the last module
    exec(f"from {'.'.join(dir_list[:-1])} import {dir_list[-1]}")
    db_parent = locals()[dir_list[-1]]
    # Get an absolute path corresponding to the db_parent folder
    abs_path = db_parent.__path__._path[0]
    rev_path = os.path.join(abs_path, 'rev_*.py')
    rev_names = [os.path.basename(x) for x in glob.glob(rev_path)]
    if rev_id is None:
        revision = rev_names[-1]
    else:
        revision = rev_names[rev_id]
    revision = revision.split('.')[0]
    # Import the attribute
    exec(f'from {import_path}.{revision} import {attr}', globals())
Some explanations:
Apparently (I didn't know this), we can import a folder as a module; this module has a __path__ attribute (found using the built-in dir function).
glob.glob lets us use shell-style wildcard patterns to search for files matching the required pattern in the directory.
Using exec without extra arguments imports only into the local namespace (the namespace of the method), so it doesn't pollute the global namespace.
Using exec with globals() imports into the global namespace.
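As a side note, the same lookup can be done without exec, which keeps the imports explicit and avoids touching namespaces by hand. A minimal sketch using importlib and pathlib, assuming the same data_base layout and the numeric rev_<n>.py naming:

import importlib
import pathlib
import re

def import_from_db(import_path, attr, rev_id=None):
    # e.g. import_path = 'data_base.foo_data'
    package = importlib.import_module(import_path)
    pkg_dir = pathlib.Path(package.__path__[0])
    # Sort numerically so rev_10 comes after rev_2
    revs = sorted(pkg_dir.glob('rev_*.py'),
                  key=lambda p: int(re.search(r'\d+', p.stem).group()))
    chosen = revs[-1] if rev_id is None else revs[rev_id]
    module = importlib.import_module(f"{import_path}.{chosen.stem}")
    return getattr(module, attr)

# Usage:
# foofoo = import_from_db('data_base.foo_data', 'foofoo')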

Take my data from my computer and Verify that data is not stolen

As a Python programmer, I recently received a small project to edit and add some functions to (the project is Python/Django).
But while I was working on it, I noticed something unusual: the presence of some Python libraries (hashlib and others) that could take my data (Gmail accounts, passwords, Chrome bookmarks, ...) from the computer.
This script is an example from the code.
import hashlib
import logging
import re
import pandas as pd
from file import reader
logger = logging.getLogger(__name__)
def load_data(user_id, project_id, columns: dict):
    group_files = {}
    df = pd.DataFrame(None)
    for id_, column in columns.items():
        group = column['group']
        if group not in group_files:
            df_file = reader.load_file(user_id, project_id, group)
            group_files[group] = df_file
        if group_files[group] is not None:
            if column['content'] in group_files[group].columns:
                df[column['content']] = group_files[group][column['content']]
    return df

def get_hash(string: str):
    return hashlib.md5(string.encode()).hexdigest()[:5]
My question is: How can I know if they are taking my data from the computer or not?
Thanks in Advance.

How to Import column from xlsx > Create Json Array > Update SQLite DB

I need help converting an xlsx or csv file into something that looks like the example below so that I can import it into SQLite.
Stocks_Update = [
    {'stock_ticker': 'TSLA'},
    {'stock_ticker': 'MSFT'},
    {'stock_ticker': 'AAPL'},
    {'stock_ticker': 'GOOG'},
    {'stock_ticker': 'FB'},
    {'stock_ticker': 'SQ'},
]
The columns look like this in Excel:
I have found a way to import it using
import pandas as pd
from pandas import ExcelWriter
from tkinter import Tk
from tkinter.filedialog import askopenfilename
root = Tk()
ftypes = [(".xlsm","*.xlsx",".xls")]
ttl = "Title"
dir1 = 'C:\\'
filePath=r"C:\Users\home\Desktop\code_projects\FIRE_Dashboard\stock_ticker.xlsx"
#filePath = askopenfilename(filetypes = ftypes, initialdir = dir1, title = ttl)
The tutorial where I found this turns the imported file into a pandas DataFrame, but I need to make that array instead.
stocklist = pd.read_excel(filePath)
stocklist=stocklist.head()
exportList= pd.DataFrame(columns=['Stock', "RS_Rating", "50 Day MA", "150 Day Ma", "200 Day MA", "52 Week Low", "52 week High"])
So that I could import it into SQLite using
def update_Stocks():
    for data_point in Stocks_Update:
        try:
            Stocks.create(stock_ticker=data_point['stock_ticker'])
        except IntegrityError:
            update_record = Stocks.get(stock_ticker=data_point['stock_ticker'])
            update_record.stock_ticker = data_point['stock_ticker']
            update_record.save()
I am completely lost on where to even get started, as I am not even sure whether the thing I need as a final result is a JSON array or not, but it looks similar. Any help on how to do this is much appreciated, along with helping a noobie with the correct terminology; maybe if I knew what it was called I would be able to find a guide for it online.
Anyway thanks again :)
I wasn't able to find a way to do it with Python code, but found a workaround with
https://www.sqlitetutorial.net/sqlite-import-csv/
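For reference, the list-of-dicts shape shown in the question can also be built directly in pandas. A rough sketch (assuming the tickers sit in the first column of the sheet, which is a guess since the screenshot isn't included):

import pandas as pd

filePath = r"C:\Users\home\Desktop\code_projects\FIRE_Dashboard\stock_ticker.xlsx"
df = pd.read_excel(filePath)

# Rename the first column to stock_ticker and turn each row into a dict
Stocks_Update = (df.rename(columns={df.columns[0]: 'stock_ticker'})[['stock_ticker']]
                 .to_dict('records'))
# e.g. [{'stock_ticker': 'TSLA'}, {'stock_ticker': 'MSFT'}, ...]

That list can then be fed straight into the update_Stocks() loop above.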

Python package with sample datasets but deferred download?

I have a data analysis tool that I made a Python package for, and I'd like to include some sample datasets, but I don't want to include all the datasets directly in the Python package because it would bloat the size and slow down the install for people who don't use them.
The behavior I want is: when a sample dataset is referenced, it automatically gets downloaded from a URL and saved to the package locally, but the next time it is used, the local version is read instead of re-downloading it. And this caching should persist permanently for my package, not only for the duration of the Python session.
How can I do this?
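One common pattern, and roughly what the answer below does at import time, is to cache downloads in a per-user data directory. A minimal sketch of a lazy loader, using a hypothetical package name and base URL:

import os
import urllib.request

from appdirs import user_data_dir  # third-party: pip install appdirs

CACHE_DIR = os.path.join(user_data_dir(), "mypackage", "datasets")  # "mypackage" is a placeholder

def load_dataset(name, base_url="https://example.com/datasets/"):  # hypothetical URL
    """Return the local path of a sample dataset, downloading it on first use."""
    os.makedirs(CACHE_DIR, exist_ok=True)
    local_path = os.path.join(CACHE_DIR, f"{name}.csv")
    if not os.path.isfile(local_path):
        urllib.request.urlretrieve(f"{base_url}{name}.csv", local_path)
    return local_path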
I ended up making a folder under AppData using the appdirs package
datasets.py
import os
import pandas as pd
from pandasgui.utility import get_logger
from appdirs import user_data_dir
from tqdm import tqdm
logger = get_logger(__name__)
__all__ = ["all_datasets",
"country_indicators",
"us_shooting_incidents",
"diamonds",
"pokemon",
"anscombe",
"attention",
"car_crashes",
"dots",
"exercise",
"flights",
"fmri",
"gammas",
"geyser",
"iris",
"mpg",
"penguins",
"planets",
"tips",
"titanic",
"gapminder",
"stockdata"]
dataset_names = [x for x in __all__ if x != "all_datasets"]
all_datasets = {}
root_data_dir = os.path.join(user_data_dir(), "pandasgui", "dataset_files")
# Open local data CSVs if they all exist
if all([os.path.exists(os.path.join(root_data_dir, f"{name}.csv")) for name in dataset_names]):
    for name in dataset_names:
        data_path = os.path.join(root_data_dir, f"{name}.csv")
        if os.path.isfile(data_path):
            all_datasets[name] = pd.read_csv(data_path)

# Download data if it doesn't exist locally
else:
    os.makedirs(root_data_dir, exist_ok=True)
    logger.info(f"Downloading PandasGui sample datasets into {root_data_dir}...")
    pbar = tqdm(dataset_names, bar_format='{percentage:3.0f}% {bar} | {desc}')
    for name in pbar:
        pbar.set_description(f"{name}.csv")
        data_path = os.path.join(root_data_dir, f"{name}.csv")
        if os.path.isfile(data_path):
            all_datasets[name] = pd.read_csv(data_path)
        else:
            all_datasets[name] = pd.read_csv(
                os.path.join("https://raw.githubusercontent.com/adamerose/datasets/master/",
                             f"{name}.csv"))
            all_datasets[name].to_csv(data_path, index=False)

# Add the datasets to globals so they can be imported like `from pandasgui.datasets import iris`
for name in all_datasets.keys():
    globals()[name] = all_datasets[name]
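Usage is then a plain import, as the final loop above suggests; the first import downloads the CSVs into the user data directory, and later imports read the cached copies:

from pandasgui.datasets import iris, titanic
print(iris.head())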

Can Azure Data Lake files be filtered based on Last Modified time using the Azure Python SDK?

I am trying to perform in-memory operations on files stored in Azure Data Lake. I am unable to find documentation on using a matching pattern without using the ADL Downloader.
For a single file, this is the code I use:
filename = '/<folder>/<filename>.json'
with adlsFileSystemClient.open(filename) as f:
    for line in f:
        <file-operations>
But how do we filter based on filename (string matching) or based on last modified date?
When I used U-SQL, I had the option to filter the fileset based on the last modified date.
DECLARE EXTERNAL @TodaysTime = DateTime.UtcNow.AddDays(-1);

@rawInput =
    EXTRACT jsonString string,
            uri = FILE.URI(),
            modified_date = FILE.MODIFIED()
    FROM @in
    USING Extractors.Tsv(quoting : true);

@parsedInput =
    SELECT *
    FROM @rawInput
    WHERE modified_date > @TodaysTime;
Are there any similar options to filter the files modified during a specified period when using adlsFileSystemClient?
Github Issue: https://github.com/Azure/azure-data-lake-store-python/issues/300
Any help is appreciated.
Note:
This question was answered by akharit on GitHub recently. I am providing his answer below, which solves my requirement.
There isn't any built-in functionality in the ADLS SDK itself, as there is no server-side API that will return only files modified within the last 4 hours.
It should be easy to write the code to do that after you get the list of all entries.
The modification time field returns milliseconds since the Unix epoch, which you can convert to a Python datetime object by

from datetime import datetime, timedelta
datetime.fromtimestamp(file['modificationTime'] / 1000)

And then something like

filtered = [file['name'] for file in adl.ls('/', detail=True)
            if (datetime.now() - datetime.fromtimestamp(file['modificationTime'] / 1000)) < timedelta(hours=4)]

You can use walk instead of ls for recursive enumeration as well.
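Spelled out as a loop, that client-side filter looks like this (a sketch assuming adl is the AzureDLFileSystem client from the quoted answer, keeping files modified within the last 4 hours):

from datetime import datetime, timedelta

cutoff = datetime.now() - timedelta(hours=4)
recent_files = []
for entry in adl.ls('/', detail=True):  # detail=True returns dicts with metadata
    modified = datetime.fromtimestamp(entry['modificationTime'] / 1000)
    if modified > cutoff:  # i.e. modified within the last 4 hours
        recent_files.append(entry['name'])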
Based on the code below, you can find the container-level directories and file names with their file properties, including the last_modified date. So you can control the files based on the last_modified date.
from pyspark.sql.functions import col
from azure.storage.blob import BlockBlobService
from datetime import datetime

block_blob_service = BlockBlobService(account_name='acccount_name', account_key='account-key')
container_name = 'Contaniner_name'
second_conatainer_name = 'Contaniner_name_second'
#block_blob_service.create_container(container_name)
generator = block_blob_service.list_blobs(container_name, prefix="Recovery/")
report_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
myfile = open('/dbfs/adlsaudit/auditfiles2', 'w')
for blob in generator:
    length = BlockBlobService.get_blob_properties(block_blob_service, container_name, blob.name).properties.content_length
    last_modified = BlockBlobService.get_blob_properties(block_blob_service, container_name, blob.name).properties.last_modified
    file_size = BlockBlobService.get_blob_properties(block_blob_service, container_name, blob.name).properties.content_length
    # print("\t Recovery: " + blob.name, ":" + str(length), ":" + str(last_modified))
    line = container_name + '|' + second_conatainer_name + '|' + blob.name + '|' + str(file_size) + '|' + str(last_modified) + '|' + str(report_time)
    myfile.write(line + '\n')
myfile.close()
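If you go the blob-listing route shown above, the last_modified property can also be used to filter while iterating. A small sketch, assuming the same block_blob_service client and keeping blobs modified within the last day:

from datetime import datetime, timedelta, timezone

cutoff = datetime.now(timezone.utc) - timedelta(days=1)
recent_blobs = [blob.name
                for blob in block_blob_service.list_blobs(container_name, prefix="Recovery/")
                if blob.properties.last_modified > cutoff]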
