I am trying to grab the ffprobe values from the video file into a variable that I can compare against others or move into a database. The question I have is: is there a better way of doing it than the approach below?
I don't like the multiple if/elif/line.startswith statements, and I am not sure whether split() is the best way of getting the ffprobe values.
#!/usr/bin/python
import os, sys, subprocess, shlex, re, fnmatch
from subprocess import call
videoDrop_dir = "/mnt/VoigtKampff/Temp/_Jonatha/test_drop"

# Extensions treated as video files, and the ffprobe fields we report.
# Keeping (key, label) pairs in one table replaces the original if/elif chain.
VIDEO_EXTS = ('.mov', '.mpg', '.mp4', '.wmv', '.mxf')
FIELD_LABELS = (
    ('codec_name', "Codec is: %s"),
    ('codec_type', "Codec type is: %s"),
    ('codec_long_name', "Codec long name: %s"),
    ('format_long_name', "Format long name: %s"),
    ('width', "Video pixel width is: %s"),
    ('height', "Video pixel height is: %s"),
    ('bit_rate', "Bit rate is: %s"),
    ('display_aspect_ratio', "Display aspect ratio: %s"),
    ('avg_frame_rate', "Average Frame Rate: %s"),
)
WANTED_KEYS = frozenset(key for key, _ in FIELD_LABELS)


def parse_ffprobe_output(text):
    """Parse ffprobe's flat "key=value" lines into a dict of wanted fields.

    BUG FIX: three of the original startswith() tests passed a stray start
    offset of 1 (e.g. startswith('codec_long_name=', 1)) and so could never
    match.  If a key appears in several streams, the last occurrence wins.
    Assumes *text* is a str (Python 2 output of communicate()) -- on Python 3
    pass universal_newlines=True to Popen or decode first.
    """
    info = {}
    for line in text.split('\n'):
        key, sep, value = line.strip().partition('=')
        if sep and key in WANTED_KEYS:
            info[key] = value
    return info


def probe_file(fpath):
    """Run ffprobe on *fpath*; return its (stdout, stderr) pair."""
    cmnd = ['ffprobe', '-show_format', '-show_streams', '-pretty',
            '-loglevel', 'quiet', fpath]
    p = subprocess.Popen(cmnd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    return p.communicate()


for r, d, f in os.walk(videoDrop_dir):
    for files in f:
        print("Files: %s" % files)
        if files.startswith(('._', '.')):
            print("This file: %s is not valid" % files)
        elif files.endswith(VIDEO_EXTS):
            fpath = os.path.join(r, files)
            out, err = probe_file(fpath)
            print("===============================OUTPUT START: %s ===============================" % files)
            print(out)
            info = parse_ffprobe_output(out)
            for key, label in FIELD_LABELS:
                if key in info:
                    print(label % info[key])
            print("===============================OUTPUT FINISH: %s ===============================" % files)
            if err:
                print("===============================ERROR: %s ===============================" % files)
                print(err)
        else:
            # BUG FIX: the original tested startswith() against the extension
            # tuple here, which could never be true; endswith() was meant.
            if not files.endswith(VIDEO_EXTS):
                print("This file: %s is not a valid video file" % files)
This is a bit late, but hopefully it helps others searching for a similar answer.
import json, subprocess

# grab info about video_file (v is assumed to hold the video path --
# NOTE(review): confirm it is defined by the surrounding code).
# BUG FIXES vs. the original shell string:
#  - '- i' (with a space) was a typo for '-i';
#  - '2>&1' merged stderr into stdout, which could corrupt the JSON;
#  - an argument list avoids shell quoting/injection issues for odd filenames.
ffprobe_cmd = ['/home/ubuntu/bin/ffprobe', '-v', 'quiet', '-print_format', 'json',
               '-show_format', '-show_streams', '-i', v]
s = subprocess.Popen(ffprobe_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
ffprobe_out, err = s.communicate()
ffprobe_dict = json.loads(ffprobe_out)
From here, I re-use a common method, search_dict, which can be used like:
search_dict(ffprobe_dict, 'height')
def search_dict(my_dict, field):
    """Recursively collect every value stored under key *field*.

    Walks a structure of nested dicts and lists (such as a parsed ffprobe
    JSON document) and returns a flat list of all matching values, in
    iteration order.

    my_dict -- the dict to search (nested dicts/lists are descended into)
    field   -- the key to look for
    Returns a list (empty when the key never occurs).
    """
    fields_found = []
    # .items() (not the Python-2-only .iteritems()) keeps this snippet
    # working on both Python 2 and 3.
    for key, value in my_dict.items():
        if key == field:
            fields_found.append(value)
        elif isinstance(value, dict):
            # extend() replaces the original element-by-element copy loop
            fields_found.extend(search_dict(value, field))
        elif isinstance(value, list):
            for item in value:
                if isinstance(item, dict):
                    fields_found.extend(search_dict(item, field))
    return fields_found
You should ask this question on https://codereview.stackexchange.com/
Related
This is my first post. I have been doing Python programming for quite some time and recently was working on a multi-threaded downloader. But the problem is that my file (a .jpg is my target) gets corrupted. Also, with the following input: http://www.aumathletics.com/images_web/headerAUMLogo.jpg
it shows error
while with the input :
http://www.nasa.gov/images/content/607800main_kepler1200_1600-1200.jpg
the file gets corrupted.
Here is the code:-
import os, sys, requests
import threading
import urllib2
import time
URL = sys.argv[1]
def buildRange(value, numsplits):
    """Split *value* bytes into *numsplits* contiguous 'start-end' ranges.

    The ranges are inclusive (HTTP Range style), cover offsets 0..value-1
    exactly once, and differ in size by at most one byte.

    BUG FIX: the original's rounding arithmetic made the last range request
    byte *value*, which does not exist (valid offsets are 0..value-1).

    value     -- total number of bytes
    numsplits -- how many ranges to produce
    Returns a list of 'start-end' strings.
    """
    lst = []
    for i in range(numsplits):
        # integer boundaries: chunk i covers [i*value//n, (i+1)*value//n - 1]
        start = i * value // numsplits
        end = (i + 1) * value // numsplits - 1
        lst.append('%s-%s' % (start, end))
    return lst
def main(url=None, splitBy=5):
start_time = time.time()
if not url:
print "Please Enter some url to begin download."
return
fileName = "image.jpg"
sizeInBytes = requests.head(url, headers={'Accept-Encoding': 'identity'}).headers.get('content-length', None)
print "%s bytes to download." % sizeInBytes
if not sizeInBytes:
print "Size cannot be determined."
return
dataDict = {}
# split total num bytes into ranges
ranges = buildRange(int(sizeInBytes), splitBy)
def downloadChunk(idx, irange):
req = urllib2.Request(url)
req.headers['Range'] = 'bytes={}'.format(irange)
dataDict[idx] = urllib2.urlopen(req).read()
# create one downloading thread per chunk
downloaders = [
threading.Thread(
target=downloadChunk,
args=(idx, irange),
)
for idx,irange in enumerate(ranges)
]
# start threads, let run in parallel, wait for all to finish
for th in downloaders:
th.start()
for th in downloaders:
th.join()
print 'done: got {} chunks, total {} bytes'.format(
len(dataDict), sum( (
len(chunk) for chunk in dataDict.values()
) )
)
print "--- %s seconds ---" % str(time.time() - start_time)
if os.path.exists(fileName):
os.remove(fileName)
# reassemble file in correct order
with open(fileName, 'w') as fh:
for _idx,chunk in sorted(dataDict.iteritems()):
fh.write(chunk)
print "Finished Writing file %s" % fileName
print 'file size {} bytes'.format(os.path.getsize(fileName))
if __name__ == '__main__':
main(URL)
the indentation here might be wrong so here is the code pastebin(dot)com/wGEkp878
I would be very grateful if someone could point the error
EDIT: suggested by a guy
# NOTE(review): this "suggested" rewrite is not runnable as written:
#  - buildRange() is called recursively with no arguments, although the
#    function requires (value, numsplits);
#  - the return value is a list, which has no .start() or .end() methods.
# Kept verbatim for reference; prefer the loop-based version above.
def buildRange(value, numsplits):
    lst = []
    for i in range(numsplits):
        first = i if i == 0 else buildRange().start(i, value, numsplits)
        second = buildRange().end(i, value, numsplits)
        lst.append("{}-{}".format(first, second))
    return lst
Can anyone tell me how to keep the downloaded part files with names like part1, part2, and so on?
It turns out the file must be opened in binary mode, with 'wb' instead of 'w'. If opened with just 'w' a bunch of extra characters will be written. This has something to do with derpy windows vs. linux new line semantics. If you use 'wb' it will write exactly what you put in into the file.
EDIT:
If you want to store the individual file parts you can change
# reassemble file in correct order
# NOTE(review): text mode 'w' is the corruption bug discussed above --
# binary data must be written with 'wb' so newline bytes are not translated.
with open(fileName, 'w') as fh:
    for _idx,chunk in sorted(dataDict.iteritems()):
        fh.write(chunk)

print "Finished Writing file %s" % fileName
print 'file size {} bytes'.format(os.path.getsize(fileName))
To
# reassemble file in correct order
# Writes each chunk to its own "<name>.part-<idx>" file instead of one
# combined file; 'wb' keeps the binary data untouched.
for _idx,chunk in sorted(dataDict.iteritems()):
    with open(fileName + str(".part-") + str(_idx), 'wb') as fh:
        fh.write(chunk)

print "Finished Writing file %s" % fileName
#print 'file size {} bytes'.format(os.path.getsize(fileName))
I am wanting to read in the logData as binary and then parse the binary output in the second for loop as it is for a string but for binary. Is this possible?
import re
import sys

# Decode binary log records against a dictionary of format strings built
# from the processed-sources listing read on stdin.
logData = open(sys.argv[1]).readlines()
processedSources = sys.stdin.readlines()

# (moduleId, logLine) -> [moduleId, logLine, argCount, fileName, outputString]
stringDictionary = {}
for line in processedSources:
    # Match data looking for MODULE_ID, LOG_LINE, ARG_COUNT, FILE_NAME, DATA_STRING
    match = re.search(r"(\d+),\s+(\d+),\s+(\d+),\s+(.*),\s+(\".*\")", line)
    if match:
        moduleId = int(match.group(1))
        logLine = int(match.group(2))
        argCount = int(match.group(3))
        fileName = match.group(4)
        outputString = match.group(5)
        stringDictionary[(moduleId, logLine)] = [moduleId, logLine, argCount, fileName, outputString]
    else:
        print("Failed string dictionary on: " + line)

for line in logData:
    # Match data looking for MODULE_ID, LOG_LINE, ARG_COUNT, ARGUMENTS
    matchLogData = re.split(r"\s+", line)
    if matchLogData:
        moduleId = int(matchLogData[0], 16)
        logLine = int(matchLogData[1], 16)
        argCount = int(matchLogData[2], 16)
        # BUG FIX: the original indexed stringDictionary[...] directly, which
        # raises KeyError for unknown ids -- its "not found" branch was
        # unreachable.  Membership test reaches it correctly.
        if (moduleId, logLine) in stringDictionary:
            processedData = stringDictionary[(moduleId, logLine)]
            if argCount != processedData[2]:
                print("Argument count mismatch on : " + line)
                print(" expected %d found %d" % (argCount, processedData[2]))
            else:
                logString = "%02x:%4d:%s:" + processedData[4]
                # BUG FIX: renamed from logData, which shadowed the list
                # being iterated by the outer loop.
                logValues = (processedData[0], processedData[1], processedData[3])
                for index in range(argCount):
                    logValues = logValues + (int(matchLogData[index + 3], 16),)
                print(logString % logValues)
        else:
            print("ModuleId:%d Line:%d, not found in source dictionary" % (moduleId, logLine))
            print(" Line data: " + line)
    else:
        print("Expected log input data mismatch MODULE_ID LOG_LINE ARG_COUNT ARGS")
        print("Line: " + line)
I have an IRC bot and I'm parsing data, but to make it refresh I have to reload the plugin.I've noticed it works once I change something on the file or just open and save it again, once it reloads it gets the right info. That's the file that reloads the plugin:
reload.py
import collections
import glob
import os
import re
import sys
import traceback
# Preserve reload state across re-execution of this module: reload.py can
# re-exec itself, and unconditionally assigning here would wipe the caches.
if 'mtimes' not in globals():
    mtimes = {}  # filename -> last seen st_mtime

if 'lastfiles' not in globals():
    lastfiles = set()  # NOTE(review): never written in this file -- confirm callers use it
def make_signature(f):
    """Return a (filename, function name, first line) triple identifying *f*."""
    code = f.func_code
    return code.co_filename, f.func_name, code.co_firstlineno
def format_plug(plug, kind='', lpad=0, width=40):
    """Render one listing line for *plug*: its signature, padded to column 50,
    followed by a kind-specific detail (command name, event list, or regex).
    """
    text = ' ' * lpad + '%s:%s:%s' % make_signature(plug[0])
    detail = None
    if kind == 'command':
        detail = plug[1]['name']
    elif kind == 'event':
        detail = ', '.join(plug[1]['events'])
    elif kind == 'regex':
        detail = plug[1]['regex']
    if detail is not None:
        text += ' ' * (50 - len(text)) + detail
    return text
def reload(init=False):
    """Hot-reload core modules and plugins whose files changed on disk.

    On startup (init=True) the plugin registries are created fresh; on every
    call, any core/*.py or plugins/*.py file whose mtime differs from the
    cached value is re-executed, and the command/event tables are rebuilt.

    NOTE(review): relies on module-level globals defined elsewhere -- `bot`,
    `Handler`, `mtimes` -- and on attributes (`_hook`, `_thread`, `_filename`)
    attached to plugin callables by decorators not visible in this file.
    """
    changed = False
    if init:
        # fresh registries on first load
        bot.plugs = collections.defaultdict(list)
        bot.threads = {}

    core_fileset = set(glob.glob(os.path.join("core", "*.py")))

    for filename in core_fileset:
        mtime = os.stat(filename).st_mtime
        if mtime != mtimes.get(filename):
            mtimes[filename] = mtime
            changed = True
            try:
                # core files are executed directly into this module's globals
                eval(compile(open(filename, 'U').read(), filename, 'exec'),
                     globals())
            except Exception:
                traceback.print_exc()
                if init:  # stop if there's an error (syntax?) in a core
                    sys.exit()  # script on startup
                continue

            if filename == os.path.join('core', 'reload.py'):
                # this very file was just re-executed: restart the reload
                # using the new version, then bail out of the old one
                reload(init=init)
                return

    fileset = set(glob.glob(os.path.join('plugins', '*.py')))

    # remove deleted/moved plugins
    for name, data in bot.plugs.iteritems():
        bot.plugs[name] = [x for x in data if x[0]._filename in fileset]

    for filename in list(mtimes):
        if filename not in fileset and filename not in core_fileset:
            mtimes.pop(filename)

    for func, handler in list(bot.threads.iteritems()):
        if func._filename not in fileset:
            handler.stop()
            del bot.threads[func]

    # compile new plugins
    for filename in fileset:
        mtime = os.stat(filename).st_mtime
        if mtime != mtimes.get(filename):
            mtimes[filename] = mtime
            changed = True
            try:
                # unlike core files, plugins run in an isolated namespace
                code = compile(open(filename, 'U').read(), filename, 'exec')
                namespace = {}
                eval(code, namespace)
            except Exception:
                traceback.print_exc()
                continue

            # remove plugins already loaded from this filename
            for name, data in bot.plugs.iteritems():
                bot.plugs[name] = [x for x in data
                                   if x[0]._filename != filename]

            for func, handler in list(bot.threads.iteritems()):
                if func._filename == filename:
                    handler.stop()
                    del bot.threads[func]

            for obj in namespace.itervalues():
                if hasattr(obj, '_hook'):  # check for magic
                    if obj._thread:
                        bot.threads[obj] = Handler(obj)
                    for type, data in obj._hook:
                        bot.plugs[type] += [data]
                        if not init:
                            print '### new plugin (type: %s) loaded:' % \
                                type, format_plug(data)

    if changed:
        # rebuild the command table from scratch
        bot.commands = {}
        for plug in bot.plugs['command']:
            name = plug[1]['name'].lower()
            if not re.match(r'^\w+$', name):
                print '### ERROR: invalid command name "%s" (%s)' % (name,
                    format_plug(plug))
                continue
            if name in bot.commands:
                print "### ERROR: command '%s' already registered (%s, %s)" % \
                    (name, format_plug(bot.commands[name]),
                     format_plug(plug))
                continue
            bot.commands[name] = plug

        # rebuild the event dispatch table
        bot.events = collections.defaultdict(list)
        for func, args in bot.plugs['event']:
            for event in args['events']:
                bot.events[event].append((func, args))

    if init:
        print ' plugin listing:'
        if bot.commands:
            # hack to make commands with multiple aliases
            # print nicely
            print ' command:'
            commands = collections.defaultdict(list)
            for name, (func, args) in bot.commands.iteritems():
                commands[make_signature(func)].append(name)
            for sig, names in sorted(commands.iteritems()):
                names.sort(key=lambda x: (-len(x), x))  # long names first
                out = ' ' * 6 + '%s:%s:%s' % sig
                out += ' ' * (50 - len(out)) + ', '.join(names)
                print out

        for kind, plugs in sorted(bot.plugs.iteritems()):
            if kind == 'command':
                continue
            print ' %s:' % kind
            for plug in plugs:
                print format_plug(plug, kind=kind, lpad=6)
        print
Let's say the plugin I want to reload once a minute is called flightsinfo.py. How can I do that ?
The important code looks like it's here:
# (Excerpt from reload() above -- the mtime gate deciding whether a plugin
# file gets recompiled.)
mtime = os.stat(filename).st_mtime
if mtime != mtimes.get(filename):  # only re-exec when the file changed on disk
    mtimes[filename] = mtime
    changed = True
    try:
        code = compile(open(filename, 'U').read(), filename, 'exec')
        namespace = {}  # fresh namespace so stale names from the old version vanish
        eval(code, namespace)
    except Exception:
        traceback.print_exc()
        continue  # NOTE: only valid inside the surrounding "for filename" loop
If the modification time of the file has changed (e.g. when you open and save it), then the compile/exec functionality is called.
There are a couple of ways to solve this issue, which depend on your situation:
Periodically update the mtime of the file. For example on linux you might run a cron job once a minute to touch /path/to/flightsinfo.py.
In reload.py refactor the functionality to reload to a function and call that from you python.
# Sketch: the reload logic extracted into a callable so it can be invoked on
# a timer (e.g. once a minute) for a single file.  Body intentionally elided.
def reload(filename):
    try:
        code = compile(open(filename, 'U').read(), filename, 'exec')
        namespace = {}
        eval(code, namespace)
    ...
This program is supposed to run from command line like this:
python Filename Folder_photos_are_in New_Prefix
It should just rename the files, but it wasn't working, so I had it print out each function separately as it runs. It seems to work all right until the SortByMTime function at which time it deletes all of the files from my list except the last one.
Here is the code:
import sys
import os
import random
def filterByExtension(root, allfiles, extensions):
    """Return the names in *allfiles* whose (lowercased) extension is in
    *extensions* and which are regular files under *root*.

    Names with no dot, or with a leading dot only (hidden files), are skipped.
    Matching names that are not regular files are reported and skipped.
    """
    matching = []
    for name in allfiles:
        dot = name.rfind('.')
        if dot <= 0:
            continue  # no extension, or hidden file like '.foo'
        if name[dot + 1:].lower() not in extensions:
            continue
        if os.path.isfile(os.path.join(root, name)):
            matching.append(name)
        else:
            print("Matching irregular file " + name)
    return matching
def sortByMTime(root, matching):
    """Pair each filename in *matching* with its modification time and return
    the (mtime, name) pairs sorted oldest-first.
    """
    stamped = [(os.path.getmtime(os.path.join(root, name)), name)
               for name in matching]
    stamped.sort()
    return stamped
def assignNames(prefix, inorder):
    """Map each original filename to '<prefix><NNN>.<ext>'.

    inorder -- list of (mtime, filename) tuples, oldest first
    Returns {old_name: new_name}; indices are zero-padded so the new names
    sort lexically in the same order.

    BUG FIXES: removed the unreachable `print newnames` after the return,
    and switched split('.')[1] to rsplit('.', 1)[1] so multi-dot names like
    'a.b.jpg' keep their real extension ('jpg', not 'b').
    """
    digits = len(str(len(inorder)))       # pad width for the index
    template = '%%0%dd' % digits
    newnames = {}
    for kount, (_mtime, name) in enumerate(inorder, 1):
        newnames[name] = prefix + (template % kount) + '.' + name.rsplit('.', 1)[1]
    return newnames
def makeTempName(allfiles):
    """Return a '__temp<N>__' name that does not collide with *allfiles*.

    Starts from a random number and counts upward until the name is free.
    """
    r = random.randrange(0, 1000000000)
    while True:
        name = "__temp%i__" % r
        if name not in allfiles:
            return name
        r += 1
def makeScript(inorder, newnames, tempname):
    """Build an ordered list of (old, new) renames that is safe to apply.

    inorder  -- list of (mtime, filename) tuples (oldest first)
    newnames -- {old_name: new_name}; consumed (entries deleted) as planned
    tempname -- a free name used to break rename cycles
    Returns the rename script as a list of (old, new) pairs.

    BUG FIXES vs. the original:
    - iterate the *filename* of each (mtime, name) tuple; the original looked
      up whole tuples in newnames, so nothing ever matched and the script
      came out empty;
    - `newnames[i] == id` compared against the builtin id(); the intended
      check is old == new (no-op rename);
    - script.append(a, b) was a TypeError; tuples are appended now;
    - tempname was accepted but never used, so rename cycles (a->b, b->a)
      looped forever; cycles are now broken through tempname.
    """
    script = []
    for _mtime, old in inorder:
        if old not in newnames:
            continue  # already handled as part of an earlier chain
        if newnames[old] == old:
            del newnames[old]  # no-op rename
            continue
        # follow the chain of renames starting at old
        chain = []
        link = old
        while True:
            target = newnames[link]
            chain.append((link, target))
            link = target
            if link not in newnames:
                # chain ends at a free name: apply renames last-first so
                # each destination is vacant when we get to it
                for a, b in reversed(chain):
                    script.append((a, b))
                    del newnames[a]
                break
            if link == old:
                # cycle: park the first file at tempname, shift the rest,
                # then move the parked file to its final name
                script.append((old, tempname))
                for a, b in reversed(chain[1:]):
                    script.append((a, b))
                    del newnames[a]
                script.append((tempname, newnames[old]))
                del newnames[old]
                break
    return script
def doRenames(root, script):
    """Apply the (old, new) rename pairs in *script* under directory *root*.

    Prints each rename; aborts the program if a destination already exists
    rather than clobbering it.

    BUG FIX: the original called os.exit(1), which does not exist (it would
    raise AttributeError); sys.exit(1) is used instead.
    """
    for old, new in script:
        print("%s -> %s" % (old, new))
        fulloldpath = os.path.join(root, old)
        fullnewpath = os.path.join(root, new)
        if os.path.exists(fullnewpath):
            print("File already exists")
            sys.exit(1)
        else:
            os.rename(fulloldpath, fullnewpath)
def main():
    """Command-line driver: rename image files in a directory to
    <prefix><NNN>.<ext>, ordered by modification time.

    Usage: python bulkrename.py <directory> [<prefix>]
    When <prefix> is omitted, the directory's basename is used.

    BUG FIX: the original printed args[1] as the prefix but then always set
    prefix = basename(path), silently ignoring a user-supplied prefix --
    which is why the sample run produced 'bulkrename1.png' despite passing
    'test'.  The debug prints are kept so the pipeline stays inspectable.
    """
    args = sys.argv[1:]  # drop the script name
    if len(args) == 0 or len(args) > 2:
        print("INVALID Number of Arguments:")
        print("Usage: python bulkrename.py <directory> [<prefix>]")
        exit(1)

    usrdir = args[0]
    path = os.path.abspath(usrdir)
    prefix = args[1] if len(args) == 2 else os.path.basename(path)
    print("Directory: %s" % usrdir)
    print("Prefix: %s" % prefix)

    allfiles = os.listdir(usrdir)
    print("Printout of allfiles")
    print(allfiles)
    print("")

    print("root: %s" % path)
    print("")

    extensions = ['jpeg', 'jpg', 'png', 'gif']
    print("What Extensions should be able to be used: ")
    print(extensions)
    print("")

    matching = filterByExtension(path, allfiles, extensions)
    print("What comes out of filterByExtension")
    print(matching)
    print("")

    inorder = sortByMTime(path, matching)
    print("What comes out of sortByMTime")
    print(inorder)
    print("")

    newnames = assignNames(prefix, inorder)
    print("What comes out of assignNames")
    print(newnames)
    print("")

    tempname = makeTempName(allfiles)
    print("What comes out of makeTempName")
    print(tempname)
    print("")

    script = makeScript(inorder, newnames, tempname)
    print("What comes out of makeScript")
    print(script)
    print("")

    doRenames(path, script)


main()
and here is the output from terminal
virus-haven:CS1410 chrislebaron$ python bulkrenamer.py bulkrename test
Directory: bulkrename
Prefix: test
Printout of allfiles
['.DS_Store', '20120902Snow_Canyon02.JPG', '20120902Snow_Canyon03.JPG', '20120902Snow_Canyon05.JPG', '20120902Snow_Canyon06.JPG', '20120902Snow_Canyon08.JPG', '20120902Snow_Canyon09.JPG', '20120902Snow_Canyon11.JPG', '20120902Snow_Canyon12.JPG', 'airplane.png', 'BackNoText.jpg', 'blah', 'FrontNoText.jpg', 'glitchbusters.jpg', 'IMG_7663.JPG', 'IMG_7664.JPG', 'Pomegranates.jpg', 'rccar.png']
root: /Users/chrislebaron/Documents/School/CS1410/bulkrename
What Extensions should be able to be used:
['jpeg', 'jpg', 'png', 'gif']
What comes out of filterByExtension
['20120902Snow_Canyon02.JPG', '20120902Snow_Canyon03.JPG', '20120902Snow_Canyon05.JPG', '20120902Snow_Canyon06.JPG', '20120902Snow_Canyon08.JPG', '20120902Snow_Canyon09.JPG', '20120902Snow_Canyon11.JPG', '20120902Snow_Canyon12.JPG', 'airplane.png', 'BackNoText.jpg', 'FrontNoText.jpg', 'glitchbusters.jpg', 'IMG_7663.JPG', 'IMG_7664.JPG', 'Pomegranates.jpg', 'rccar.png']
What comes out of sortByMTime
[(1322960835.0, 'rccar.png')]
What comes out of assignNames
{'rccar.png': 'bulkrename1.png'}
What comes out of makeTempName
__temp55210675__
What comes out of makeScript
[]
What comes out of doRenames
<function doRenames at 0x100dede60>
virus-haven:CS1410 chrislebaron$
You've goofed your indentation, mixing spaces and tabs. Use python -tt to verify.
I have a BLAST output file in XML format. It is 22 query sequences with 50 hits reported from each sequence. And I want to extract all the 50x22 hits. This is the code I currently have, but it only extracts the 50 hits from the first query.
from Bio.Blast import NCBIXML  # BUG FIX: original said "NCBIXM" (missing L) and would raise ImportError
blast_records = NCBIXML.parse(result_handle)
# next() yields only the FIRST query's record -- which is why only the first
# 50 hits are written; iterate over blast_records to cover every query.
blast_record = blast_records.next()
save_file = open("/Users/jonbra/Desktop/my_fasta_seq.fasta", 'w')
for alignment in blast_record.alignments:
    for hsp in alignment.hsps:
        save_file.write('>%s\n' % (alignment.title,))
save_file.close()
Does anybody have any suggestions on how to extract all the hits? I guess I have to use something other than alignments.
Hope this was clear. Thanks!
Jon
This should get all records. The novelty compared with the original is the
for blast_record in blast_records
which is a Python idiom to iterate through items in a "list-like" object, such as blast_records (checking the NCBIXML module documentation showed that parse() indeed returns an iterator)
from Bio.Blast import NCBIXML  # BUG FIX: original said "NCBIXM" (missing L) and would raise ImportError
blast_records = NCBIXML.parse(result_handle)
save_file = open("/Users/jonbra/Desktop/my_fasta_seq.fasta", 'w')
# Iterate over every query's record, not just the first one.
for blast_record in blast_records:
    for alignment in blast_record.alignments:
        for hsp in alignment.hsps:
            save_file.write('>%s\n' % (alignment.title,))
    # here possibly to output something to file, between each blast_record
save_file.close()
I used this code for extract all the results
from Bio.Blast import NCBIXML

# Walk every query record in the BLAST XML report and print its matches.
for record in NCBIXML.parse(open("rpoD.xml")):
    print("QUERY: %s" % record.query)
    for align in record.alignments:
        print(" MATCH: %s..." % align.title[:60])
        for hsp in align.hsps:
            print(" HSP, e=%f, from position %i to %i"
                  % (hsp.expect, hsp.query_start, hsp.query_end))
            if hsp.align_length < 60:
                # short enough to show in full
                print(" Query: %s" % hsp.query)
                print(" Match: %s" % hsp.match)
                print(" Sbjct: %s" % hsp.sbjct)
            else:
                # truncate long alignments to keep the output readable
                print(" Query: %s..." % hsp.query[:57])
                print(" Match: %s..." % hsp.match[:57])
                print(" Sbjct: %s..." % hsp.sbjct[:57])
print("Done")
or for less details
from Bio.Blast import NCBIXML

# Compact report: one line per HSP, skipping queries with no hits.
for record in NCBIXML.parse(open("NC_003197.xml")):
    # We want to ignore any queries with no search results:
    if not record.alignments:
        continue
    print("QUERY: %s..." % record.query[:60])
    for align in record.alignments:
        for hsp in align.hsps:
            print(" %s HSP, e=%f, from position %i to %i"
                  % (align.hit_id, hsp.expect, hsp.query_start, hsp.query_end))
print("Done")
I used this site
http://www2.warwick.ac.uk/fac/sci/moac/currentstudents/peter_cock/python/rpsblast/