Splitting a large JSON file into multiple files in Python: memory issues

I am new to Python. I am trying to split a JSON file into multiple files based on the objects it contains: each object should go into its own file. The script I have is below, and it does the job, but it runs into memory issues when we process a large volume of files (around 1000 files, each 10 MB with roughly 8k objects).
#!/usr/bin/python
import json
import os
import shutil
import time
import sys
import commands
import pwd
from collections import OrderedDict

b = commands.getstatusoutput('ps -ef | grep splitjson.py | wc -l')
c = int(b[1])
if c > 3:
    print c
else:
    print c

path = '/home/subhome/json2/'        # Source directory containing the JSON files
kpath = '/home/subhome/jskaf/'       # Destination path
jepath = '/home/subhome/jskaf/err/'  # The error path for JSON files that are not well formatted
apath = '/home/subhome/jskaf/arch/'  # The archive path for all processed files

direc = os.listdir(path)
print(path)

# Iterate over files in the source directory
for f in direc:
    name, ext = os.path.splitext(f)
    a = f.split('.json')[0]
    obpath = path + f
    print obpath
    kfpath = kpath + a
    jerrpath = jepath + a
    arcpath = apath + a
    with open(obpath) as fl:
        try:
            #data2 = json.loads(fl.read())
            docs = json.load(fl, object_pairs_hook=OrderedDict)
            for ii, doc in enumerate(docs):
                with open(kfpath + '.{}.json'.format(ii), 'w') as out:
                    outflname = kfpath + '.' + str(ii)
                    json.dump(doc, out, indent=2)
            shutil.copy(obpath, arcpath)
            os.remove(obpath)
        except ValueError as e:
            print("An exception occurred")
            errdata = str(e)
            print(e)
            shutil.copy(obpath, jerrpath)
            os.remove(obpath)
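Since json.load parses each whole file into memory before the inner loop even starts, a streaming parser that yields one object at a time keeps memory flat. Below is a minimal sketch of that idea, assuming each input file holds a single top-level JSON array and that the third-party ijson package is an option (the split_json_file name is just for illustration):

import json
import ijson  # pip install ijson -- incremental JSON parser

def split_json_file(obpath, kfpath):
    # Stream the array element by element instead of loading the whole document.
    with open(obpath, 'rb') as fl:
        for ii, doc in enumerate(ijson.items(fl, 'item')):
            with open(kfpath + '.{}.json'.format(ii), 'w') as out:
                json.dump(doc, out, indent=2)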

Related

While obtaining file hashes, some folders and files from the directory are not showing up

My code was working just fine before I added the hash function: I was getting the list of all folders and files in my directory in the PrettyTable. Once I added the hash function, only about 5 of the files in that directory showed up in the table with hashes. I am not sure where I have gone wrong. Please forgive me, I am new to this; we are not learning to code from scratch, but have to modify existing code to function the way we need it to.
# Python Standard Libraries
import os       # file system methods
import hashlib  # hashing functions
import sys      # system methods
import time     # time conversions

# Python 3rd Party Libraries
from prettytable import PrettyTable  # pip install prettytable

# Local Functions
def GetFileMetaData(fileName):
    # obtain file system metadata
    try:
        metaData = os.stat(fileName)         # Use the stat method to obtain meta data
        fileSize = metaData.st_size          # Extract fileSize and MAC Times
        timeLastAccess = metaData.st_atime
        timeLastModified = metaData.st_mtime
        timeCreated = metaData.st_ctime
        macTimeList = [timeLastModified, timeCreated, timeLastAccess]  # Group the MAC Times in a List
        return True, None, fileSize, macTimeList
    except Exception as err:
        return False, str(err), None, None

# Pseudo Constants

# Start of the Script
tbl = PrettyTable(['FilePath', 'FileSize', 'UTC-Modified', 'UTC-Accessed', 'UTC-Created', 'SHA-256 HASH'])

# file check
while True:
    targetFolder = input("Enter Target Folder: ")
    if os.path.isdir(targetFolder):
        break
    else:
        print("\nInvalid Folder ... Please Try Again")

print("Walking: ", targetFolder, "\n")
print()

for currentRoot, dirList, fileList in os.walk(targetFolder):
    for nextFile in fileList:
        fullPath = os.path.join(currentRoot, nextFile)
        absPath = os.path.abspath(fullPath)
        fileSize = os.path.getsize(absPath)
        success, errInfo, fileSize, macList = GetFileMetaData(absPath)
        if success:
            # convert to readable Greenwich Time
            modTime = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(macList[0]))
            accTime = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(macList[1]))
            creTime = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(macList[2]))

            # hashing function
            with open(absPath, 'rb') as target:
                fileContents = target.read()
                sha256Obj = hashlib.sha256()
                sha256Obj.update(fileContents)
                hexDigest = sha256Obj.hexdigest()
            tbl.add_row([absPath, fileSize, modTime, accTime, creTime, hexDigest])

tbl.align = "l"  # align the columns left justified

# display the table
print(tbl.get_string(sortby="FileSize", reversesort=True))
print("\nScript-End\n")

Make os.walk/os.stat ignore permission denied?

I have made the below Python 2.4 script that scans the files on a number of Linux disks and returns their paths and stats from os.stat in human-readable format.
#!/usr/bin/env python
from datetime import datetime as dt
import os
import grp
import pwd

locations = ["/disk1", "/disk2", "/disk3", "/disk4", "/disk5", "/disk6"]

for item in locations:
    for root, dirs, files in os.walk(item):
        for fn in files:
            path = os.path.join(root, fn)
            stats = os.lstat(path)
            size = str(stats.st_size)
            user = pwd.getpwuid(stats.st_uid)[0]
            group = grp.getgrgid(stats.st_gid)[0]
            laccess = dt.fromtimestamp(stats.st_atime).strftime("%Y-%m-%d,%H:%M:%S")
            lmod = dt.fromtimestamp(stats.st_mtime).strftime("%Y-%m-%d,%H:%M:%S")
            c = ","
            t1 = ["0,", path, c, size, c, user, c, group, c, laccess, c, lmod, "\n"]
            outfile = open("/data/promigrate/DiskDashboard/Temp/filescan.csv", "at")
            outfile.write("".join(t1))
            outfile.close()
However I have discovered that I do not have access to some of the files on the system.
I get the below error:
abc12bmm -34> python files_scan.py
Traceback (most recent call last):
File "files_scan.py", line 24, in ?
stats = os.lstat(path)
OSError: [Errno 13] Permission denied: '/disk1/file5'
Is it possible to handle the error in such a way that the process skips the permission denied error and continues to the next file?
I have been playing around with exceptions etc but no joy, so any help would be greatly appreciated.
Thanks!
Justin
The issue was caused by os.lstat not having permission to stat the file; I had been putting the try-except in the wrong place. To handle the error, the try and except should be added as below.
#!/usr/bin/env python
from datetime import datetime as dt
import os
import grp
import pwd

locations = ["/disk1", "/disk2", "/disk3", "/disk4", "/disk5", "/disk6"]

for item in locations:
    for root, dirs, files in os.walk(item):
        for fn in files:
            path = os.path.join(root, fn)
            try:
                stats = os.lstat(path)
                size = str(stats.st_size)
                user = pwd.getpwuid(stats.st_uid)[0]
                group = grp.getgrgid(stats.st_gid)[0]
                laccess = dt.fromtimestamp(stats.st_atime).strftime("%Y-%m-%d,%H:%M:%S")
                lmod = dt.fromtimestamp(stats.st_mtime).strftime("%Y-%m-%d,%H:%M:%S")
                c = ","
                t1 = ["0,", path, c, size, c, user, c, group, c, laccess, c, lmod, "\n"]
                outfile = open("/data/promigrate/DiskDashboard/Temp/filescan.csv", "at")
                outfile.write("".join(t1))
                outfile.close()
            except OSError:
                pass
This now handles the error and continues running.
Thanks for all your help!
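A related note: if any directories themselves are unreadable, os.walk skips them silently by default; its onerror argument can log those cases too. A small sketch of that option (log_error is just an illustrative name):

import os

def log_error(err):
    # os.walk passes an OSError here when it cannot list a directory.
    print("Skipping %s: %s" % (err.filename, err.strerror))

for root, dirs, files in os.walk("/disk1", onerror=log_error):
    pass  # process files as in the script above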

Pydoop gets stuck on readline from HDFS files

I am reading the first line of all the files in a directory. Locally it works fine, but on EMR this test fails: it gets stuck at around the 200th-300th file.
Also, ps -eLF shows the number of child threads growing to around 3000, even though the print output is only at around the 200th file.
Is this some bug on EMR related to reading max bytes?
pydoop version
pydoop==0.12.0
import os
import sys
import shutil
import codecs
import pydoop.hdfs as hdfs

def prepare_data(hdfs_folder):
    folder = "test_folder"
    copies_count = 700
    src_file = "file"
    # 1) create a folder
    if os.path.exists(folder):
        shutil.rmtree(folder)
    os.makedirs(folder)
    # 2) create XXX copies of file in folder
    for x in range(0, copies_count):
        shutil.copyfile(src_file, folder + "/" + src_file + "_" + str(x))
    # 3) copy folder to hdfs
    # hadoop fs -copyFromLocal test_folder/ /maaz/test_aa
    remove_command = "hadoop fs -rmr " + hdfs_folder
    print remove_command
    os.system(remove_command)
    command = "hadoop fs -copyFromLocal " + folder + " " + hdfs_folder
    print command
    os.system(command)

def main(hdfs_folder):
    try:
        conn_hdfs = hdfs.fs.hdfs()
        if conn_hdfs.exists(hdfs_folder):
            items_list = conn_hdfs.list_directory(hdfs_folder)
            for item in items_list:
                if not item["kind"] == "file":
                    continue
                file_name = item["name"]
                print "validating file : %s" % file_name
                try:
                    file_handle = conn_hdfs.open_file(file_name)
                    file_line = file_handle.readline()
                    print file_line
                    file_handle.close()
                except Exception as exp:
                    print '####Exception \'%s\' in reading file %s' % (str(exp), file_name)
                    file_handle.close()
                    continue
        conn_hdfs.close()
    except Exception as e:
        print "####Exception \'%s\' in validating files!" % str(e)

if __name__ == '__main__':
    hdfs_path = '/abc/xyz'
    prepare_data(hdfs_path)
    main(hdfs_path)
I suggest using the subprocess module for reading the first line instead of pydoop's conn_hdfs.open_file:
import subprocess

cmd = 'hadoop fs -cat {f}|head -1'.format(f=file_name)
process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, stderr = process.communicate()
if stderr == '':
    file_line = stdout.split('\n')[0]
else:
    print "####Exception '{e}' in reading file {f}".format(f=file_name, e=stderr)
    continue
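Wrapped up as a standalone helper, the same idea looks roughly like this (the read_first_line name is just for illustration, and it keeps the convention of treating any stderr output as a failure):

import subprocess

def read_first_line(file_name):
    # Shell out to `hadoop fs -cat ... | head -1` and return the first line, or None on error.
    cmd = 'hadoop fs -cat {f} | head -1'.format(f=file_name)
    process = subprocess.Popen(cmd, shell=True,
                               stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = process.communicate()
    if stderr:
        print("####Exception '{e}' in reading file {f}".format(f=file_name, e=stderr))
        return None
    return stdout.decode('utf-8', 'replace').split('\n')[0]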

Script that reads PDF metadata and writes to CSV

I wrote a script to read PDF metadata to ease a task at work. The current working version is not very usable in the long run:
from pyPdf import PdfFileReader

BASEDIR = ''
PDFFiles = []

def extractor():
    output = open('windoutput.txt', 'r+')
    for file in PDFFiles:
        try:
            pdf_toread = PdfFileReader(open(BASEDIR + file, 'r'))
            pdf_info = pdf_toread.getDocumentInfo()
            #print str(pdf_info)  # print full metadata if you want
            x = file + "~" + pdf_info['/Title'] + " ~ " + pdf_info['/Subject']
            print x
            output.write(x + '\n')
        except:
            x = file + '~' + ' ERROR: Data missing or corrupt'
            print x
            output.write(x + '\n')
            pass
    output.close()

if __name__ == "__main__":
    extractor()
Currently, as you can see, I have to manually input the working directory and manually populate the list of PDF files. It also just prints out the data in the terminal in a format that I can copy/paste/separate into a spreadsheet.
I'd like the script to work automatically in whichever directory I throw it in and populate a CSV file for easier use. So far:
from pyPdf import PdfFileReader
import csv
import os

def extractor():
    basedir = os.getcwd()
    extension = '.pdf'
    pdffiles = [filter(lambda x: x.endswith('.pdf'), os.listdir(basedir))]
    with open('pdfmetadata.csv', 'wb') as csvfile:
        for f in pdffiles:
            try:
                pdf_to_read = PdfFileReader(open(f, 'r'))
                pdf_info = pdf_to_read.getDocumentInfo()
                title = pdf_info['/Title']
                subject = pdf_info['/Subject']
                csvfile.writerow([file, title, subject])
                print 'Metadata for %s written successfully.' % (f)
            except:
                print 'ERROR reading file %s.' % (f)
                #output.writerow(x + '\n')
                pass

if __name__ == "__main__":
    extractor()
In its current state it seems to just print a single error message (the message from my exception handler, not an error raised by Python) and then stop. I've been staring at it for a while and I'm not really sure where to go from here. Can anyone point me in the right direction?
writerow([file, title, subject]) should be writerow([f, title, subject])
You can use sys.exc_info() to print the details of your error
http://docs.python.org/2/library/sys.html#sys.exc_info
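For example, the bare except could report the underlying error roughly like this (the open call here is only a stand-in for the PdfFileReader call that actually fails):

import sys

f = 'missing.pdf'  # stand-in for one entry of pdffiles
try:
    data = open(f, 'rb').read()  # stand-in for PdfFileReader(open(f, 'r'))
except:
    exc_type, exc_value, _ = sys.exc_info()
    print('ERROR reading file %s: %s: %s' % (f, exc_type.__name__, exc_value))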
Did you check that the pdffiles variable contains what you think it does? I was getting a list inside a list... so maybe try:
for files in pdffiles:
    for f in files:
        pass  # do stuff with f
I personally like glob. Notice I add * before the .pdf in the extension variable:
import os
import glob

basedir = os.getcwd()
extension = '*.pdf'
pdffiles = glob.glob(os.path.join(basedir, extension))
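Note that glob returns the full paths here (basedir is already joined in), so the open(f, 'r') call in the loop works without prepending the directory again.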
Figured it out. The script I used to download the files was saving the files with '\r\n' trailing after the file name, which I didn't notice until I actually ls'd the directory to see what was up. Thanks for everyone's help.
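If the file names ever come from a text listing again, stripping line endings before use, e.g. name = raw_name.rstrip('\r\n'), guards against the same problem.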

Extract ZipFile using Python, display Progress Percentage?

I know how to extract a zip archive using Python, but how exactly do I display the progress of that extraction in a percentage?
I suggest using tqdm; you can install it using pip like so:
pip install tqdm
Then, you can use it directly like so:
>>> import zipfile
>>> from tqdm import tqdm
>>>
>>> with zipfile.ZipFile(some_source) as zf:
...     for member in tqdm(zf.infolist(), desc='Extracting '):
...         try:
...             zf.extract(member, target_path)
...         except zipfile.error as e:
...             pass
This will produce something like so:
Extracting : 100%|██████████| 60.0k/60.0k [14:56<00:00, 66.9File/s]
The extract method doesn't provide a callback for this, so one would have to use getinfo to get the uncompressed size, then open the member, read from it in blocks, and write the blocks to the place you want the file to go while updating the percentage. One would also have to restore the mtime if that is wanted. An example:
import zipfile

z = zipfile.ZipFile(some_source)
entry_info = z.getinfo(entry_name)
i = z.open(entry_name)
o = open(target_name, 'w')
offset = 0
while True:
    b = i.read(block_size)
    offset += len(b)
    set_percentage(float(offset) / float(entry_info.file_size) * 100.)
    if b == '':
        break
    o.write(b)
i.close()
o.close()
set_attributes_from(entry_info)
This extracts entry_name to target_name.
Most of this is also done by shutil.copyfileobj, but it doesn't have a callback for progress either.
The source of the ZipFile.extract method calls _extract_member, which uses:
source = self.open(member, pwd=pwd)
target = file(targetpath, "wb")
shutil.copyfileobj(source, target)
source.close()
target.close()
where member has been converted from a name to a ZipInfo object by getinfo(member) if it wasn't already a ZipInfo object.
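The set_percentage and set_attributes_from helpers in the example are left to the caller. As a rough sketch, the mtime part of set_attributes_from could be built from ZipInfo.date_time (here with the target path passed in explicitly, unlike the one-argument call above):

import os
import time

def set_attributes_from(entry_info, target_name):
    # ZipInfo.date_time is a (year, month, day, hour, minute, second) tuple.
    mtime = time.mktime(entry_info.date_time + (0, 0, -1))
    os.utime(target_name, (mtime, mtime))  # set both atime and mtime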
Sorry, a bit late seeing this. I had a similar problem, needing an equivalent of zipfile.ZipFile.extractall. If you have tqdm>=4.40.0 (which I released over a year ago), then:
from os import fspath
from pathlib import Path
from shutil import copyfileobj
from zipfile import ZipFile

from tqdm.auto import tqdm  # could use from tqdm.gui import tqdm
from tqdm.utils import CallbackIOWrapper

def extractall(fzip, dest, desc="Extracting"):
    """zipfile.ZipFile(fzip).extractall(dest) with progress"""
    dest = Path(dest).expanduser()
    with ZipFile(fzip) as zipf, tqdm(
        desc=desc, unit="B", unit_scale=True, unit_divisor=1024,
        total=sum(getattr(i, "file_size", 0) for i in zipf.infolist()),
    ) as pbar:
        for i in zipf.infolist():
            if not getattr(i, "file_size", 0):  # directory
                zipf.extract(i, fspath(dest))
            else:
                with zipf.open(i) as fi, open(fspath(dest / i.filename), "wb") as fo:
                    copyfileobj(CallbackIOWrapper(pbar.update, fi), fo)
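Called as, for example, extractall("archive.zip", "out_dir") (both names are just placeholders), this shows one byte-scaled progress bar for the whole archive rather than one tick per member, so a few very large files still advance the bar smoothly.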
For the lazy, below is a self-contained working example based on Dan D's answer. Tested on Python 3.10.6. Not optimized, but works.
In this example, the assumption is that the target "test" directory exists, but you can of course create it in the extract function.
The advantage of Dan's answer over most of the answers I've seen on this topic is that updating the progress once per processed file does not achieve the goal when the archive consists of a few very large files; this version reports progress within each file as it is written.
import zipfile
import os
from pathlib import Path

def extract(zip_path, target_path):
    block_size = 8192
    z = zipfile.ZipFile(zip_path)
    for entry_name in z.namelist():
        entry_info = z.getinfo(entry_name)
        i = z.open(entry_name)
        print(entry_name)
        if entry_name[-1] != '/':
            dir_name = os.path.dirname(entry_name)
            p = Path(f"{target_path}/{dir_name}")
            p.mkdir(parents=True, exist_ok=True)
            o = open(f"{target_path}/{entry_name}", 'wb')
            offset = 0
            while True:
                b = i.read(block_size)
                offset += len(b)
                print(float(offset) / float(entry_info.file_size) * 100.)
                if b == b'':
                    break
                o.write(b)
            o.close()
        i.close()
    z.close()

extract("test.zip", "test")
import zipfile

srcZipFile = 'srcZipFile.zip'
distZipFile = 'distZipFile'

with zipfile.ZipFile(srcZipFile) as zf:
    filesList = zf.namelist()
    for idx, file in enumerate(filesList):
        percent = round((idx / len(filesList)) * 100)
        print(percent)
        zf.extract(file, distZipFile)
zf.close()
