Pydoop gets stuck on readline from HDFS files - python

I am reading the first line of every file in a directory. Locally it works fine, but on EMR this test gets stuck at around the 200th-300th file.
Also, ps -eLF shows the number of children increasing to 3000, even though the output has only reached about the 200th file.
Is this some bug on EMR, such as a limit on the maximum number of bytes that can be read?
pydoop version
pydoop==0.12.0
import os
import sys
import shutil
import codecs
import pydoop.hdfs as hdfs
def prepare_data(hdfs_folder):
    """Build a local folder of test files and upload it to HDFS.

    Recreates the local directory ``test_folder`` with 700 copies of the
    local file ``file``, removes ``hdfs_folder`` from HDFS and then copies
    the fresh folder there using the ``hadoop fs`` command-line tool.

    Args:
        hdfs_folder: destination path on HDFS; any existing content at that
            path is removed first.

    Raises:
        OSError: if the ``hadoop`` executable cannot be started.
    """
    import subprocess  # local import: only this helper runs external commands

    folder = "test_folder"
    copies_count = 700
    src_file = "file"

    # 1) create an empty folder
    if os.path.exists(folder):
        shutil.rmtree(folder)
    os.makedirs(folder)

    # 2) create copies_count copies of the source file in the folder
    for x in range(copies_count):
        shutil.copyfile(src_file, os.path.join(folder, "%s_%d" % (src_file, x)))

    # 3) copy the folder to HDFS.
    # Commands are passed as argument lists (no shell), so a path containing
    # spaces or shell metacharacters is handed to hadoop unchanged.
    remove_command = ["hadoop", "fs", "-rmr", hdfs_folder]
    print(" ".join(remove_command))
    subprocess.call(remove_command)

    command = ["hadoop", "fs", "-copyFromLocal", folder, hdfs_folder]
    print(" ".join(command))
    subprocess.call(command)
def main(hdfs_folder):
    """Print the first line of every regular file directly under hdfs_folder.

    Opens one HDFS connection, lists ``hdfs_folder`` and, for each entry
    whose ``kind`` is ``"file"``, opens it, prints its first line and closes
    it again. A failure on one file is reported and the loop moves on to the
    next file; any other failure is reported once and ends the run.

    Args:
        hdfs_folder: HDFS directory whose files are read.
    """
    try:
        conn_hdfs = hdfs.fs.hdfs()
        try:
            if conn_hdfs.exists(hdfs_folder):
                for item in conn_hdfs.list_directory(hdfs_folder):
                    if item["kind"] != "file":
                        continue
                    file_name = item["name"]
                    print("validating file : %s" % file_name)
                    # Reset per file so the cleanup below never touches a
                    # handle left over from a previous iteration.
                    file_handle = None
                    try:
                        file_handle = conn_hdfs.open_file(file_name)
                        print(file_handle.readline())
                    except Exception as exp:
                        print('####Exception \'%s\' in reading file %s' % (str(exp), file_name))
                    finally:
                        # Close only what was actually opened; open_file()
                        # itself may be the call that failed.
                        if file_handle is not None:
                            file_handle.close()
        finally:
            # Release the connection on every path, including errors.
            conn_hdfs.close()
    except Exception as e:
        print("####Exception \'%s\' in validating files!" % str(e))
# Script entry point: upload the test fixture to hdfs_path, then read the
# first line of each file stored there.
if __name__ == '__main__':
hdfs_path = '/abc/xyz'
prepare_data(hdfs_path)
main(hdfs_path)

I suggest using the subprocess module for reading the first line instead of pydoop's conn_hdfs.open_file
import subprocess
cmd='hadoop fs -cat {f}|head -1'.format(f=file_name)
process=subprocess.Popen(cmd, shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
stdout, stderr=process.communicate()
if stderr!='':
file_line=stdout.split('\n')[0]
else:
print "####Exception '{e}' in reading file {f}".format(f=file_name,e=stdout)
continue

Related

Json split python larger file memory issues

I am new to Python. I am trying to split a JSON file into multiple files based on the objects it contains; each object goes into its own file. The script I have is below. It does the job perfectly, but it runs into memory issues when we process a large volume of files (1000 files, each about 10 MB and containing around 8k objects).
#!/usr/bin/python
import json
import os
import shutil
import time
import sys
import commands
import pwd
from collections import OrderedDict
# Count the lines of `ps -ef` output that mention splitjson.py.
b = commands.getstatusoutput('ps -ef | grep splitjson.py | wc -l')
c = int(b[1])
# NOTE(review): both branches print the same value, so this check currently
# has no effect; presumably one branch was meant to stop the script — confirm.
if c > 3:
print c
else:
print c
path = '/home/subhome/json2/' # Input directory whose files are split
kpath = '/home/subhome/jskaf/' # Destination path
jepath = '/home/subhome/jskaf/err/' # Error path: badly formatted JSON files are copied here
apath = '/home/subhome/jskaf/arch/' # Archive path: successfully split files are copied here
# Snapshot of the input directory listing that the loop below iterates over.
direc = os.listdir(path)
print(path)
# Iterate over files in directory: write each top-level element of a JSON
# file to its own output file, then archive the input and delete it.
for f in direc:
    a = f.split('.json')[0]
    obpath = path + f
    print(obpath)
    kfpath = kpath + a
    jerrpath = jepath + a
    arcpath = apath + a
    try:
        # Parse inside the `with` so the input is closed before it is copied
        # and removed further down.
        with open(obpath) as fl:
            docs = json.load(fl, object_pairs_hook=OrderedDict)
        for ii, doc in enumerate(docs):
            with open(kfpath + '.{}.json'.format(ii), 'w') as out:
                json.dump(doc, out, indent=2)
        # Drop the parsed document now; otherwise it stays alive while the
        # next file is being parsed and two documents are held at once.
        del docs
        shutil.copy(obpath, arcpath)
        os.remove(obpath)
    except ValueError as e:
        # Badly formatted JSON: report it and move the file to the error path.
        print("An exception occurred")
        print(e)
        shutil.copy(obpath, jerrpath)
        os.remove(obpath)

Not able to download files from FTP

I am trying to download files from my FTP server using a Python script. However, the files I get are all 0 KB in size, and I can't understand where I am going wrong. I search for files whose names contain a particular string and then download every matching file from a given directory on the FTP server.
Here is my code:
# Libraries
import re
import os
import ftplib
import ntpath
# Connect and log in.
# NOTE(review): the host string carries a ":22" suffix; ftplib normally takes
# the port as a separate argument — confirm this is only a placeholder.
ftp = ftplib.FTP("192.168.1.786:22")
ftp.login("Marshmellow", "YourPasswordHere")
##ftp.dir("feed_1")
files = []
## F = open('Files.txt','a')
try:
# List the names under "feed_1" without changing the current directory.
files = ftp.nlst("feed_1")
for fname in files:
# Keep only names that contain the date string.
res = re.findall("2018-07-25", fname)
if res:
# Open the file for writing in binary mode
print 'Opening local file ' + ntpath.basename(fname)
file = open(ntpath.basename(fname), 'wb')
# Download the file a chunk at a time
# Each chunk is passed to file.write, which appends it to the local file
# RETR is an FTP command
print 'Getting ' + ntpath.basename(fname)
# NOTE(review): RETR is sent for the bare base name although the listing
# was taken for "feed_1" and no cwd() was issued. If that request fails,
# the bare except below hides the error and the local file created by
# open() above stays empty.
try:
ftp.retrbinary('RETR ' + ntpath.basename(fname), file.write)
except:
pass
# Clean up time
print 'Closing file ' + ntpath.basename(fname)
file.close()
print (fname)
## F.write(fname + '\n')
if not res:
continue
# Python 2 except syntax: binds the error_perm instance to resp.
except ftplib.error_perm , resp:
if str(resp) == "550 No files found":
print "No files in this directory"
pass
else:
raise
## F.close()
Please help me out if anyone knows what's wrong with this.
try:
ftp.cwd("feed_1")
files = ftp.nlst() for fname in files:
res = re.findall("2018-07-25", fname) if res:
# Open the file for writing in binary mode
print 'Opening local file ' + ntpath.basename(fname)
file = open(ntpath.basename(fname), 'wb')
I just set the current working directory with ftp.cwd("feed_1") and then called ftp.nlst() with no argument. Earlier I had done it the wrong way, with: files = ftp.nlst("feed_1")

cPickle : stack underflow error

I have a directory of 170,000+ pickle files in multiple subdirectories which were originally pickled using the (protocol=0) format. This hasn't been very efficient time or space-wise.
I wrote a script to re-pickle (using cPickle, protocol=2) each file in the folder(s) but curiously, the script throws an exception while processing a particular file (file # 95,000).
Initially, I thought that the pickle file is corrupted. When I try to load this exact pickle file from IPython command line, the file loads just fine.
So, I'm dumbfounded as to why this happens. Here's my script and I appreciate help:
import os
import cPickle
import numpy
import time
import re
from progressbar import ProgressBar
inpath = '/path/to/folder'
def list_files(dir):
    """Return the paths of all files found under ``dir``, recursively.

    The tree is walked once, top-down. Each returned path is the directory
    that contains the file joined with the file name, so paths start with
    ``dir`` exactly as it was passed in.

    Args:
        dir: root directory to scan.

    Returns:
        A list of file paths; empty when ``dir`` holds no files or does not
        exist.
    """
    r = []
    # A single os.walk pass already yields the file names of every directory,
    # so there is no need to walk each subdirectory a second time.
    for subdir, _dirnames, filenames in os.walk(dir):
        for name in filenames:
            r.append(os.path.join(subdir, name))
    return r
# Re-pickle every file found under inpath with protocol 2, writing the result
# to the same relative location under the new root.
infileList = list_files(inpath)
print("Total number of files found: %d" % len(infileList))
print("\n\n")
progress = ProgressBar()
outfilename = " "
print("Processing pickle files. Pls wait...")
t0 = time.time()
filecount = 0
for file in progress(infileList):
    try:
        # Close each input right after loading so file handles do not pile
        # up over the whole run.
        # NOTE(review): unpickling executes code embedded in the file; only
        # run this on pickle files you created yourself.
        with open(file, "rb") as infile:
            arr = cPickle.load(infile)
        outfilename = re.sub('/initial/path/', '/new/path/', file)
        outdir = os.path.dirname(outfilename)
        if not os.path.exists(outdir):
            os.makedirs(outdir)
        with open(outfilename, "wb") as f:
            cPickle.dump(arr, f, protocol=2)
        filecount = filecount + 1
    except Exception as e:
        print("\n" + str(filecount))
        # Name the input file: outfilename still holds the previous file's
        # output path when the load itself is what failed.
        print("\nError occured while processing file: " + file)
        # Show the actual exception so the failure can be diagnosed.
        print("\nReason: %r" % (e,))
        tx = time.time()
        print("\n Time elapsed: %.2f" % (tx - t0))
        continue
t1 = time.time()
total = t1 - t0
print("Files repickled with protocol=2.\nRepickling execution time: %.2f sec" % total)

Python create sqlite DB

I have the current code for a sqlite db creation:
import storage
import os
import audiotools
# Walk directory d and pass every file found to a storage.HashStore.
def store_dir(d):
store = storage.HashStore()
for root, bar, files in os.walk(d):
for filename in files:
# Build the full path of the file from the directory being visited.
filename = root + '/' + filename
try:
store.store_file(filename)
# NOTE(review): "% filename" is inside the string literal, so the literal
# text is printed and the file name is never interpolated.
print ("Stored %s% filename")
except audiotools.UnsupportedFile:
# NOTE(review): the % operator is applied outside the parentheses here.
print ('Skipping unsupported file %s') % filename
# Python 2 except syntax: binds the exception instance to e.
except Exception, e:
print (e)
# Ask for the music directory and store every file found under it.
def main():
# NOTE(review): under Python 2, input() evaluates what the user types as an
# expression instead of returning the raw text — confirm the target version.
d = input('Enter the path to the music directory: ')
store_dir(d)
print ("Done.")
# Run main() only when the file is executed as a script.
if __name__ == '__main__':
main()
When this code runs I get a syntax error msg. Please help !
Thanks in advance
There are a few things to address here.
First, this line:
print ('Skipping unsupported file %s') % filename
needs to be this:
print ('Skipping unsupported file %s' % filename)
Second, you need to use raw_input here:
d = input('Enter the path to the music directory: ')
which returns a string object, instead of input, which evaluates input as real Python code.
Third, your indentation is off. I'm pretty sure this is just a SO formatting error though.
Finally, you should use os.path.join here:
filename = root + '/' + filename
That isn't an error though, just a tip.
All in all, your code should look like this:
import storage
import os
import audiotools
def store_dir(d):
    """Store every file found under directory ``d`` in a HashStore.

    Walks ``d`` recursively and hands each file path to
    ``storage.HashStore.store_file``. Files that raise
    ``audiotools.UnsupportedFile`` are reported and skipped; any other error
    is printed and the walk continues with the next file.

    Args:
        d: path of the directory to scan.
    """
    store = storage.HashStore()
    for root, _dirs, files in os.walk(d):
        for filename in files:
            filename = os.path.join(root, filename)
            try:
                store.store_file(filename)
                # The % operator must sit outside the string literal for the
                # file name to be interpolated.
                print("Stored %s" % filename)
            except audiotools.UnsupportedFile:
                print('Skipping unsupported file %s' % filename)
            except Exception as e:
                # "as" is accepted by Python 2.6+ and Python 3 alike.
                print(e)
def main():
    """Prompt for the music directory and store every file found in it."""
    music_dir = raw_input('Enter the path to the music directory: ')
    store_dir(music_dir)
    print("Done.")


# Run only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

Python Sub process call with filename variable

I've got a small script which monitors when files are added to or removed from a directory. The next step is to get the script to execute the files (Windows batch files) once they've been added to the directory. I'm struggling to understand how to use a variable with subprocess.call (if this is the best way this can be achieved). Could anyone help me please? Many thanks. The code looks like this so far:
import sys
import time
import os
# Directory to watch; the script also makes it the current working directory.
inputdir = 'c:\\test\\'
os.chdir(inputdir)
contents = os.listdir(inputdir)
# NOTE(review): this is the length of the path string, not the number of
# directory entries, and it is not used below.
count = len(inputdir)
dirmtime = os.stat(inputdir).st_mtime
# Poll loop: compare the directory's modification time with the last seen one.
while True:
newmtime = os.stat(inputdir).st_mtime
if newmtime != dirmtime:
dirmtime = newmtime
newcontents = os.listdir(inputdir)
# Names present now that were not in the previous listing.
added = set(newcontents).difference(contents)
if added:
print "These files added: %s" %(" ".join(added))
import subprocess
# Placeholder from the question: "%" is not valid Python here; which value to
# pass as the command is what is being asked.
subprocess.call(%,shell=True)
# Names from the previous listing that are no longer present.
removed = set(contents).difference(newcontents)
if removed:
print "These files removed: %s" %(" ".join(removed))
contents = newcontents
time.sleep(15)
This should do what you wanted, cleaned it up a little.
import sys
import time
import os
import subprocess
def monitor_execute(directory, poll_interval=15):
    """Watch ``directory`` and execute every file that is added to it.

    Polls the directory's modification time. Whenever it changes, the
    listing is compared with the previous one: newly added files are
    reported and executed through the shell, removed files are reported.
    The function loops forever and does not return.

    Args:
        directory: path of the directory to watch.
        poll_interval: seconds to sleep between two checks.

    Raises:
        OSError: if ``directory`` cannot be listed or stat'ed.
    """
    # Absolute path, so executing a new file works whatever the current
    # working directory of this process is.
    directory = os.path.abspath(directory)
    dir_contents = os.listdir(directory)
    last_modified = os.stat(directory).st_mtime
    while True:
        time.sleep(poll_interval)
        modified = os.stat(directory).st_mtime
        if last_modified == modified:
            continue
        last_modified = modified
        current_contents = os.listdir(directory)
        new_files = set(current_contents).difference(dir_contents)
        if new_files:
            print('Found new files: %s' % ' '.join(new_files))
            for new_file in new_files:
                # os.listdir() returns bare names: join them with the watched
                # directory, and run the file from inside that directory.
                subprocess.call(os.path.join(directory, new_file),
                                shell=True, cwd=directory)
        lost_files = set(dir_contents).difference(current_contents)
        if lost_files:
            print('Lost these files: %s' % ' '.join(lost_files))
        dir_contents = current_contents

Categories