Selecting files based on creation date - Python

I have a folder named myclientcard with 69 subfolders. Each of those subfolders contains further subfolders, one of which is an error folder, and each error folder holds a number of .txt files. I want to collect the contents of the text files inside every error folder across all 69 subfolders, but only for files created between 17/01/2019 and 24/01/2019, and write the result to an Excel file.
import os
import numpy as np
from os import listdir
from os.path import join
import pandas as pd
from pandas import ExcelWriter
from pandas import ExcelFile
mypath = "D:\myclientcard"
files = [join(mypath,f) for f in listdir(mypath) if '.txt' not in f]
for file in files:
    path = file
    filename = [join(path,f) for f in listdir(path) if 'ERROR' in f]
    #print(filename)
    for text_file_path in filename:
        file_path = text_file_path
        textfiles = [join(file_path,f) for f in listdir(file_path) if '.txt' in f]
        for files in textfiles:
            reading_files = open(files,'r')
            read = reading_files.read()
            writting_files = open('result.txt','a')
            wr = writting_files.write(read)
            read_files = pd.read_csv('result.txt', delim_whitespace=True)
            writer = ExcelWriter('output.xlsx')
            read_files.to_excel(writer,'Sheet1',index=False)
            writer.save()
            reading_files.close()
            writting_files.close()

Assuming you are on a Windows platform, you can filter the files by their creation time before reading them:
import os
from os import listdir
from os.path import join
# Importing datetime module
from datetime import datetime as dt
import pandas as pd
from pandas import ExcelWriter

mypath = r"D:\myclientcard"
# Add start date here
start_date = dt.strptime('17/01/2019', '%d/%m/%Y')
# Add end date here
end_date = dt.strptime('24/01/2019', '%d/%m/%Y')

files = [join(mypath, f) for f in listdir(mypath) if '.txt' not in f]
for file in files:
    path = file
    error_folders = [join(path, f) for f in listdir(path) if 'ERROR' in f]
    #print(error_folders)
    for file_path in error_folders:
        textfiles = [join(file_path, f) for f in listdir(file_path) if '.txt' in f]
        # Filtering on the basis of creation date
        # (getctime returns an epoch timestamp, so convert it before comparing)
        textfiles = [f for f in textfiles
                     if start_date <= dt.fromtimestamp(os.path.getctime(f)) <= end_date]
        for textfile in textfiles:
            with open(textfile, 'r') as reading_file, open('result.txt', 'a') as writing_file:
                writing_file.write(reading_file.read())

# Read the combined text file once and write it to Excel
read_files = pd.read_csv('result.txt', delim_whitespace=True)
writer = ExcelWriter('output.xlsx')
read_files.to_excel(writer, 'Sheet1', index=False)
writer.save()
On a side note, consider optimizing your code. Also look at os.walk; it can be useful for traversing nested folders like this.
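For reference, here is a minimal sketch of how os.walk could replace the nested listdir loops. The folder name, date range, and ERROR naming convention are taken from the question; everything else is illustrative rather than a drop-in replacement.

import os
from datetime import datetime as dt

mypath = r"D:\myclientcard"
start_date = dt.strptime('17/01/2019', '%d/%m/%Y')
end_date = dt.strptime('24/01/2019', '%d/%m/%Y')

matching = []
# os.walk visits every subfolder, so ERROR folders are found at any depth
for root, dirs, files in os.walk(mypath):
    if 'ERROR' in os.path.basename(root):
        for name in files:
            if name.endswith('.txt'):
                full = os.path.join(root, name)
                # getctime returns a timestamp; convert it before comparing
                created = dt.fromtimestamp(os.path.getctime(full))
                if start_date <= created <= end_date:
                    matching.append(full)

matching then holds the paths of the error files in the date range, ready to be read and written to Excel as in the code above.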

Related

Outputting or reading columns in a different order using pandas/python

I have a Python script that reads a CSV file and outputs it to HTML.
I would like to change the order of the columns when they are written to the HTML file, but I don't know whether I should do this on read or on write, or what function to use. Here is my code so far:
import pandas as pd
import os
import shutil
import glob
#paths
HTMLPATH="C:/NMS4/QUE/HTML/"
QUEPATH="C:/NMS4/QUE/"
#create the directory for holding the HTML files if it doesn't exist
isExist = os.path.exists(HTMLPATH)
if not isExist:
    os.mkdir(HTMLPATH, 0o777)
#convert .txt file to a html file in the HTML folder
#can't convert an empty file so only convert if file size is not 0
for quefile in glob.iglob('C:/NMS4/QUE/*.txt'):
    if os.path.getsize(quefile) != 0:
        csv = pd.read_csv(quefile, header=None, usecols=[0,3,4,15,34,43,44,129], names=['OrderNo','Req Qty','Planned Start','Resource','Op','Part','Desc','Qty Recd'])
        html_table = csv.to_html()
        f = open(quefile + '.html', 'w')
        f.write(html_table)
        f.close()
        shutil.move(quefile + ".HTML", HTMLPATH)
Any help, greatly appreciated.
Thanks.
I've been looking at https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_html.html, but can't quite find out how to re-order the columns.
Edit: the changes to get the solution are:
import pandas as pd
import os
import shutil
import glob
#paths
HTMLPATH="C:/NMS4/QUE/HTML/"
QUEPATH="C:/NMS4/QUE/"
#create the directory for holding the HTML files if it doesn't exist
isExist = os.path.exists(HTMLPATH)
if not isExist:
    os.mkdir(HTMLPATH, 0o777)
#convert .txt file to a html file in the HTML folder
#python can't convert an empty file so only convert if file size is not 0
for quefile in glob.iglob('C:/NMS4/QUE/*.txt'):
    if os.path.getsize(quefile) != 0:
        csv = pd.read_csv(quefile, header=None, usecols=[0,3,4,15,34,43,44,129], names=['Order No','Req Qty','Planned Start','Resource','Op','Part','Desc','Qty Recd'])
        cols = list(csv.columns)
        a, b, c, d, e, f, g = cols.index('Order No'), cols.index('Req Qty'), cols.index('Planned Start'), cols.index('Resource'), cols.index('Op'), cols.index('Part'), cols.index('Desc')
        cols[a], cols[b], cols[c], cols[d], cols[e], cols[f], cols[g] = cols[a], cols[e], cols[f], cols[g], cols[c], cols[b], cols[d]
        df = csv[cols]
        html_table = df.to_html()
        f = open(quefile + '.html', 'w')
        f.write(html_table)
        f.close()
        shutil.move(quefile + ".HTML", HTMLPATH)
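As a side note, the index-swapping above can be replaced by simply selecting the columns in the order you want them to appear. This is a minimal sketch using the column names from the question; the target order and the example.txt file name are purely illustrative.

import pandas as pd

# hypothetical display order - replace with the order you actually need
desired_order = ['Order No', 'Op', 'Part', 'Desc', 'Planned Start',
                 'Req Qty', 'Resource', 'Qty Recd']

# example.txt stands in for one of the queue files
csv = pd.read_csv('example.txt', header=None,
                  usecols=[0, 3, 4, 15, 34, 43, 44, 129],
                  names=['Order No', 'Req Qty', 'Planned Start', 'Resource',
                         'Op', 'Part', 'Desc', 'Qty Recd'])
# indexing with a list of column names returns the columns in that order
html_table = csv[desired_order].to_html()

DataFrame.to_html also accepts a columns argument, which can be used to limit and order the columns that are written out.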

How can I retrieve images with a specific name?

I have a folder containing 82,000 images and a file containing the 50,000 names of the images I need. How can I use the names in the file to retrieve the images themselves?
I tried this:
import os, os.path
import fnmatch
from os import listdir
from os.path import isfile, join
import shutil
from shutil import copyfile
from pathlib import Path
dst="/new_folder"
scr="/dataset"
id_ = "name_image.txt"
files = os.listdir(scr)
for img_filename in os.listdir(scr):
    x = os.path.splitext(img_filename)[0].split('jpeg')
    print(x)
with open(id_) as f:
    lines = f.readlines()
    for lines in os.listdir(scr):
        if lines in x:
            shutil.move(os.path.join(scr,lines), dst)
but it does not work.
The file contains names in this format:
dataset_000000112915
dataset_000000112941
Try this:
import glob
import os

dst = "/new_folder"
scr = "/dataset"
id_ = "name_image.txt"

with open(id_) as f:
    # strip newlines; the file lists names without the .jpeg extension
    lines = [x.strip() for x in f.readlines()]

for f in glob.glob(os.path.join(scr, '*.jpeg'), recursive=True):
    # compare the bare file name (without its extension) against the list
    if os.path.splitext(os.path.basename(f))[0] in lines:
        os.rename(f, os.path.join(dst, os.path.basename(f)))
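A small follow-up on the lookup: with 50,000 names, checking membership in a Python list is linear per file, so converting the names to a set first should speed this up considerably. A minimal sketch, assuming the same name_image.txt layout and folders as above:

import glob
import os
import shutil

with open("name_image.txt") as f:
    wanted = {line.strip() for line in f}   # set membership checks are O(1)

for path in glob.glob(os.path.join("/dataset", "*.jpeg")):
    # compare the bare file name (no extension) against the wanted set
    if os.path.splitext(os.path.basename(path))[0] in wanted:
        shutil.move(path, "/new_folder")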

Reading multiple CSV files from a different directory in Python

import csv
import pandas
df_list = []
path = "C:/Users/bubai/Desktop/try/scrapy/output"
#all csv files
for file in os.listdir(path):
    #print(file)
    df_list.append(file) # all csv files in this directory
#print(df_list)
for i in df_list:
    df = pandas.read_csv(i) # open one by one
    print(df)
I get this error: FileNotFoundError: [Errno 2] File b'poem1.csv' does not exist: b'poem1.csv'
The file names are saved like this:
poem1.csv
poem10.csv
poem11.csv
poem12.csv
poem13.csv
poem14.csv
poem15.csv
poem16.csv
poem17.csv
poem18.csv
poem19.csv
poem2.csv
poem20.csv
You need to append the filename to the path.
import csv
import pandas
import os

df_list = []
path = "C:/Users/bubai/Desktop/try/scrapy/output"
#all csv files
for file in os.listdir(path):
    df_list.append(os.path.join(path, file)) # full path to each csv file
#print(df_list)
for i in df_list:
    df = pandas.read_csv(i) # open one by one
    print(df)
You need to concatenate the directory name with the filename in order to refer to the file.
import os
df = pandas.read_csv(os.path.join(path, i))
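If the goal is one combined DataFrame rather than printing each file, a common pattern is to read every CSV and concatenate the results once. A minimal sketch, assuming the same output directory as above:

import os
import pandas as pd

path = "C:/Users/bubai/Desktop/try/scrapy/output"
frames = [pd.read_csv(os.path.join(path, name))
          for name in os.listdir(path) if name.endswith('.csv')]
combined = pd.concat(frames, ignore_index=True)  # one DataFrame with all rows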

Speed up parsing of gzipped jsonlines files

I have about 5,000 .gzip files (~1MB each). Each of these files contains data in a jsonlines format. Here's what it looks like:
{"category_id":39,"app_id":12731}
{"category_id":45,"app_id":12713}
{"category_id":6014,"app_id":13567}
I want to parse these files and convert them to a pandas DataFrame. Is there a way to speed up this process? Here's my code, but it's kinda slow (0.5s per file):
import pandas as pd
import jsonlines
import gzip
import os
import io

path = 'data/apps/'
files = os.listdir(path)
result = []
for n, file in enumerate(files):
    print(n, file)
    with open(f'{path}/{file}', 'rb') as f:
        data = f.read()
    unzipped_data = gzip.decompress(data)
    decoded_data = io.BytesIO(unzipped_data)
    reader = jsonlines.Reader(decoded_data)
    for line in reader:
        if line['category_id'] == 6014:
            result.append(line)
df = pd.DataFrame(result)
This should allow you to read each line without loading the whole file.
import pandas as pd
import json
import gzip
import os

path = 'data/apps/'
files = os.listdir(path)
result = []
for n, file in enumerate(files):
    print(n, file)
    with gzip.open(f'{path}/{file}') as f:
        for line in f:
            data = json.loads(line)
            if data['category_id'] == 6014:
                result.append(data)
df = pd.DataFrame(result)
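Another option worth trying is letting pandas handle the decompression and JSON parsing directly: pandas.read_json supports lines=True and gzip compression, so each file can be loaded in one call and filtered afterwards. A minimal sketch under that assumption; whether it is faster than the loop above depends on the data, so it is worth timing both.

import os
import pandas as pd

path = 'data/apps/'
frames = []
for file in os.listdir(path):
    # compression='gzip' is passed explicitly since the files use a .gzip extension
    df_file = pd.read_json(os.path.join(path, file), lines=True, compression='gzip')
    frames.append(df_file[df_file['category_id'] == 6014])
df = pd.concat(frames, ignore_index=True)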

How to import multiple Excel files into a pandas DataFrame

I cannot load multiple Excel files from a directory into a single DataFrame.
I have tried two different ways and both do not work.
I get the error shown below.
How can I solve the problem? It does find the files when it creates the list, but then it cannot open them into the DataFrame.
Any hints?
import pandas as pd
import os
import glob
import xlrd

cwd = os.getcwd()
cwd
path = '/Users/giovanni/Desktop/news media'
files = os.listdir(path)
files
files_xls = [f for f in files if f[-3:] == 'lsx']
files_xls
df = pd.DataFrame()
for f in files_xls:
    data = pd.read_excel(f)
    df = df.append(data)
FileNotFoundError: [Errno 2] No such file or directory: 'NOV.xlsx'
Try this:
import os
import glob
import pandas as pd

path = '/Users/giovanni/Desktop/news media'
df = pd.DataFrame()
for file in glob.glob(os.path.join(path, '*.xlsx')):
    data = pd.read_excel(file)
    print(data)
    df = df.append(data)
Replace your final loop with:
for f in files_xls:
    full_path = os.path.join(path, f)
    data = pd.read_excel(full_path)
    df = df.append(data)
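As a side note, DataFrame.append is deprecated in recent pandas versions; collecting the frames in a list and calling pd.concat once does the same job and avoids repeated copying. A minimal sketch using the same path as above:

import glob
import os
import pandas as pd

path = '/Users/giovanni/Desktop/news media'
frames = [pd.read_excel(f) for f in glob.glob(os.path.join(path, '*.xlsx'))]
df = pd.concat(frames, ignore_index=True)  # one DataFrame with all files' rows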
