Invalid syntax hadoop streaming error - python

I am trying to run a Hadoop streaming Python job:
/home/hduser/hadoop/bin/hadoop jar /home/hduser/hadoop/share/hadoop/tools/lib/hadoop-*streaming*.jar -file audio.py -cacheFile hdfs://localhost:54310/user/hduser/fpcalc#fpcalc -input /user/hduser/audio/input -output /user/hduser/audio/output -mapper $cwd/audio.py -cmdenv AUDIO_DIR=/user/hduser/audio/input/ -verbose
I get "invalid syntax" at line 183 of audio.py (it only has 182 lines).
It seems that Hadoop is not handling the audio.py file in the way I want (I think!). I've tried removing the $cwd above, putting the name in quotes, pointing at the .pyc file instead, and using "python audio.py", but nothing helps. I have copied fpcalc to Hadoop.
Any help is much appreciated!
audio.py:
#!/usr/bin/env python
# Adapted from http://www.randombytes.org/audio_comparison.html
import os
dir_a=os.environ["AUDIO_DIR"]
# directories to compare files between
# set to the same directory to compare files between themselves
dir_b=dir_a
# file to write matches to
match_file = 'matches.txt'
# seconds to sample audio file for
sample_length = 30
# number of points to crop from beginning of fingerprint
# 4096 / 11025 Hz / 3 = 0.124 seconds per point
crop = 3
# number of points to scan cross correlation over
span = 100
# step size (in points) of cross correlation
step = 1
# report match when cross correlation has a peak exceeding threshold
threshold = 0
################################################################################
# import modules
################################################################################
import re
import commands
import numpy
import math
################################################################################
# function definitions
################################################################################
# adds escape characters in front of Bash special characters
def esc_bash_chars(string):
    """Return *string* with a backslash inserted before each Bash special char.

    Escaped set (between the A's): A`!$^&*()=[]{}\|;:'",<>? A plus space.
    The ] and ' need their own escapes inside the character class, and a
    literal backslash needs two.
    """
    specialchars = re.compile('[`!$^&*()=[\]{}\\\|;:\'",<>? ]')
    escaped_parts = []
    for ch in string:
        # prefix a backslash only when the character is in the special set
        escaped_parts.append('\\' + ch if specialchars.search(ch) else ch)
    return ''.join(escaped_parts)
# returns variance of list
def variance(listx):
    """Population variance of *listx*, computed as E[x^2] - (E[x])^2."""
    meanx = numpy.mean(listx)
    # accumulate the mean of the squares in the same order as the input
    mean_of_squares = 0
    for x in listx:
        mean_of_squares += x ** 2
    mean_of_squares /= float(len(listx))
    return mean_of_squares - meanx ** 2
# returns correlation between lists
def correlation(listx, listy):
    """Pearson correlation of two equal-length lists.

    Returns -2 as an error sentinel when the lengths differ.
    """
    if len(listx) != len(listy):
        return -2
    meanx = numpy.mean(listx)
    meany = numpy.mean(listy)
    covariance = 0
    for x, y in zip(listx, listy):
        covariance += (x - meanx) * (y - meany)
    covariance /= float(len(listx))
    return covariance / (math.sqrt(variance(listx)) * math.sqrt(variance(listy)))
# return cross correlation, with listy offset from listx
def cross_correlation(listx, listy, offset):
    """Correlation of *listx* against *listy* shifted by *offset* points.

    Positive offsets drop the head of listx; negative offsets drop the head
    of listy. The longer list is then truncated to the shorter length.
    """
    if offset > 0:
        listx = listx[offset:]
        listy = listy[:len(listx)]
    elif offset < 0:
        listy = listy[-offset:]
        listx = listx[:len(listy)]
    return correlation(listx, listy)
# cross correlate listx and listy with offsets from -span to span
def compare(listx, listy, span, step):
    """Cross-correlate at every offset in [-span, span] stepped by *step*."""
    return [cross_correlation(listx, listy, offset)
            for offset in numpy.arange(-span, span + 1, step)]
# return index of maximum value in list
def max_index(listx):
    """Index of the first occurrence of the maximum value in *listx*."""
    best = 0
    for i, value in enumerate(listx):
        if value > listx[best]:
            best = i
    return best
# write to a file
def write_string(string, filename):
    """Append *string* plus a newline to *filename* (opened in append mode)."""
    with open(filename, 'ab') as file_out:
        file_out.write(string + '\n')
################################################################################
# main code
################################################################################
# NOTE: this is a Python 2 script (it relies on the `commands` module and on
# map() returning a list).
# escape Bash special characters in the configured paths
dir_a = esc_bash_chars(dir_a)
dir_b = esc_bash_chars(dir_b)
match_file = esc_bash_chars(match_file)
# get list of files to compare from each directory
filelist_a = commands.getoutput('ls ' + dir_a + '*.*').split('\n')
filelist_b = commands.getoutput('ls ' + dir_b + '*.*').split('\n')
# if cross-correlating between files within a directory, don't correlate files
# twice, or correlate files with themselves
intra_correlating = filelist_a == filelist_b
for i, file_a in enumerate(filelist_a):
    if intra_correlating:
        # only correlate with files after the current one so each pair is
        # visited once and no file is compared with itself
        filelist_b = filelist_a[i+1:]
        if len(filelist_b) == 0:
            # nothing left to check!
            break
    file_a = esc_bash_chars(file_a)
    # calculate fingerprint via subprocess; shlex handles the tokenizing
    import shlex
    from subprocess import Popen, PIPE
    cli = './fpcalc -raw -length ' + str(sample_length) + ' ' + file_a
    cli_parts = shlex.split(cli)
    # BUG FIX: the original kept the Popen object itself and later called
    # .find() on it; capture the process's stdout text instead
    fpcalc_out = Popen(cli_parts, stdin=PIPE, stderr=PIPE, stdout=PIPE).communicate()[0]
    fingerprint_index = fpcalc_out.find('FINGERPRINT=') + 12
    # convert fingerprint to list of integers
    fingerprint_a = map(int, fpcalc_out[fingerprint_index:].split(','))
    for file_b in filelist_b:
        file_b = esc_bash_chars(file_b)
        # calculate fingerprint
        fpcalc_out = commands.getoutput('./fpcalc -raw -length '
                                        + str(sample_length) + ' ' + file_b)
        fingerprint_index = fpcalc_out.find('FINGERPRINT=') + 12
        # convert fingerprint to list of integers
        fingerprint_b = map(int, fpcalc_out[fingerprint_index:].split(','))
        # cross correlation between fingerprints
        corr_ab = compare(fingerprint_a[crop:], fingerprint_b[crop:], span, step)
        max_corr_index = max_index(corr_ab)
        max_corr_offset = -span + max_corr_index * step
        # report matches
        if corr_ab[max_corr_index] > threshold:
            # BUG FIX: this call was missing both the output filename and the
            # closing parenthesis -- an unclosed '(' at the end of the file is
            # exactly what makes Python report "invalid syntax" one line past
            # the last line (line 183 of a 182-line file)
            write_string(file_a + ' ' + file_b + '\t'
                         + str(corr_ab[max_corr_index]), match_file)

Related

How to integrate a bash command into a python code [duplicate]

This question already has answers here:
Variable interpolation in Python [duplicate]
(5 answers)
Closed 3 years ago.
everyone,
I'm looking to integrate a bash command into my Python code to calculate indices. My problem is that I want an output image with a band for each of the calculated indices, but I can't get the indices produced by the bash command into the 'im_index' matrix created by my Python code. I don't see how to link the two... Do you have any idea?
import numpy as np
import sys
import os
import spectral as sp
from scipy import ndimage
import pylab as pl
from math import *
import spectral.io.envi as envi
#------------------------------------
def reject_outliers(data, m=1):
    """Keep the elements of *data* within m standard deviations of the mean."""
    deviation = np.abs(data - np.mean(data))
    return data[deviation < m * np.std(data)]
#------------------------------------
def find_nearest(array, value):
    """Return the index of the element of *array* closest to *value*."""
    distances = np.abs(np.asarray(array) - value)
    return distances.argmin()
#------------------------------------
#Open existing dataset
# NOTE: indentation reconstructed from the flattened paste; nesting follows the
# obvious loop/if structure of the original.
src_directory = "/d/afavro/Bureau/4_reflectance/"
dossier = os.listdir (src_directory)
print(dossier)
for fichier in dossier:
    print (fichier)
    ssrc_directory = "/d/afavro/Bureau/4_reflectance/" + fichier + "/"
    rasters = os.listdir (ssrc_directory)
    print(rasters)
    OUTPUT_FOLDER = "/d/afavro/Bureau/5_indices2/" + 'indices_' + fichier + '/'
    print(OUTPUT_FOLDER)
    if not os.path.exists(OUTPUT_FOLDER):
        os.makedirs(OUTPUT_FOLDER)
    for image in rasters:
        print(image)
        name, ext = os.path.splitext(image)
        if ext == '.hdr':
            img = sp.open_image(ssrc_directory + image)
            print(image)
            im_HS = img[:,:,:]
            cols = im_HS.shape[0] # Number of column
            rows = im_HS.shape[1] # Number of lines
            bands = im_HS.shape[2] # Number of bands
            NbPix = cols * rows # Number of pixels
            #Get wavelengths from hdr file
            wv = np.asarray(img.bands.centers)
            if len(wv) == 0 :
                print("Wavelengths not defined in the hdr file")
                sys.exit("Try again!")
            if wv[0] > 100:
                wv=wv*0.001 # Convert to micrometers if necessary
            im_HS=im_HS.reshape(NbPix, bands)
            #Compute HC index------------------------------------------------------
            Nind=4 # Number of indice to be computed
            im_index=np.zeros((cols*rows, Nind))
            names = []
            ##NDVI computation
            names.append('NDVI')
            bande_ref=[0.67, 0.8]
            bRef0 = find_nearest(wv,bande_ref[0])
            bRef1 = find_nearest(wv,bande_ref[1])
            #Check if the required specral bands are available
            if (np.abs(wv[bRef0]-bande_ref[0])<=0.1 and np.abs(wv[bRef1]-bande_ref[1])<=0.1):
                b0 = im_HS[:, bRef0]
                b1 = im_HS[:, bRef1]
                index = (b0 - b1) / (b0 + b1)
            else:
                index = np.zeros_like(im_HS[:,0])
                print("Wavelengths selection problem, NDVI not computed")
            im_index[:,0]= index
            # bash command :
            inRaster = ssrc_directory + image
            print(inRaster)
            outRaster = OUTPUT_FOLDER + 'indices_' + image
            print (outRaster)
            # BUG FIX: the variable names used to sit INSIDE the string literal,
            # so the literal text "inRaster"/"outRaster" was handed to the
            # command; concatenate the actual values instead
            cmd = ('otbcli_RadiometricIndices -in ' + inRaster +
                   ' -list Soil:BI Vegetation:MSAVI Vegetation:SAVI -out ' + outRaster)
            os.system(cmd)
            #saving
            im_index=im_index.reshape(cols, rows, Nind)
            file_image = OUTPUT_FOLDER + "indices2_" + fichier
            header = envi.read_envi_header(ssrc_directory + image)
            header ['description'] = "fichier d'origine " + image
            header ['band names'] = ['NDVI', 'Sober filter', 'NDWI', 'IB(1)', 'IB(2)']
            del header['wavelength units']
            del header['wavelength']
            sp.envi.save_image(file_image + '.hdr', im_index, metadata=header, force = True, interleave = 'bsq')
Assuming this is the code you are actually asking about:
inRaster = ssrc_directory + image
print(inRaster)
outRaster = OUTPUT_FOLDER + 'indices_' + image
print (outRaster)
cmd = 'otbcli_RadiometricIndices -in inRaster -list Soil:BI Vegetation:MSAVI Vegetation:SAVI -out outRaster'
os.system(cmd)
Of course, inRaster inside of single quotes is just a literal string; to interpolate the variable's value you can say
cmd = 'otbcli_RadiometricIndices -in ' + inRaster + \
' -list Soil:BI Vegetation:MSAVI Vegetation:SAVI -out ' + \
outRaster
or
cmd = 'otbcli_RadiometricIndices -in {0} -list Soil:BI Vegetation:MSAVI Vegetation:SAVI -out {1}'.format(
inRaster, outRaster)
or a number of other string interpolation techniques in Python (legacy % formatting, f-string, etc). But a better solution is to replace os.system with the more flexible and versatile subprocess, as suggested even in the os.system documentation.
subprocess.run([
'otbcli_RadiometricIndices',
'-in', inRaster,
'-list', 'Soil:BI', 'Vegetation:MSAVI', 'Vegetation:SAVI',
'-out', outRaster], check=True)
subprocess.run was introduced in Python 3.5; if you need compatibility with older versions, try subprocess.check_call or even the crude subprocess.call.
I think you might be looking for the subprocess package. An example:
>>> import subprocess as sp
>>> output = sp.check_output('echo hello world', shell=True)
>>> print(output)
b'hello world\n'
The check_output() method can be used to collect the stdout from a command. You'd need to parse the output to get integer indices afterwards.

python multiprocessing map_async does take much more time than the sequential

I want to find the optimum path for the travelling salesman problem. It works fine with the sequential algorithm, but I have a problem with the parallel algorithm: when I run it sequentially everything is OK, but in parallel it takes about 200× longer on a 4-core computer.
Here is my sequential code:
#!/usr/local/bin/python
#Traveling Salesman Solution
#Scott Stevenson, Jack Koppenaal, John Costantino
import time, itertools, math
#Get the city file
def getFile():
    """Open and return the city file handle; print an error and return None on failure."""
    cityFile = 'in.txt'
    try:
        return open(cityFile,'r')
    except Exception as e:
        print(cityFile + ' could not be opened: ' + str(e))
def find_max():
    """Return the maximum allowed tour weight: the second whitespace-separated
    token of in.txt (still a string, as callers int() it themselves).

    Fixes: the original leaked the file handle (never closed) and shadowed the
    builtin `max` with a local name.
    """
    with open('in.txt', 'r') as f:
        tokens = f.read().split()
    return tokens[1]
def initfiles(path):
# Rewrite the city file in place: first line becomes "<n> <maxweight> OK",
# followed by "<i> <original line i>" rows. The 'OK' marker prevents a second
# rewrite on later runs.
# NOTE(review): indentation was lost in this paste; code kept byte-identical.
# NOTE(review): `max` and `file` shadow builtins -- rename when refactoring.
file = open(path, 'r')
max = file.read().splitlines()
file.close()
file = open(path, 'r')
num = file.read().split()
file.close()
final = list()
# Header row: "<count> <maxweight> OK"
final.append(str(num[0] + ' ' + num[1] + ' OK'))
# Only rewrite when the third token is not already the 'OK' marker
if (num[2] != 'OK'):
print('File has been edited.')
file_write = open(path, 'wt')
for i in range(1, int(num[0]) + 1):
final.append(str(i) + ' ' + str(max[i]))
for line in range(len(final)):
file_write.write(str(final[line]) + '\n')
file_write.close()
#Get distance between 2 cities
#Cities stored as [ident, x, y]
def getDistance(city1, city2):
    """Euclidean distance between two cities stored as [ident, x, y]."""
    dx = int(city2[1]) - int(city1[1])
    dy = int(city2[2]) - int(city1[2])
    return math.sqrt(dx**2 + dy**2)
#Get the cities from a specified city file
def getCities(cityFile):
# Parse "ident x y" lines from the open file handle and append [int, int, int]
# triples to the module-global `cities` list. Non-city lines are skipped.
# NOTE(review): the bare except also hides genuine parse errors.
for line in cityFile:
try:
#Lines split by a space char will look like:
#[ident, x-coord, y-coord]
ident = line.split(' ')[0]
x = line.split(' ')[1]
y = line.split(' ')[2].strip('\n')
#print(x,y,ident)
#If the ident is not an int (not a city) skip it otherwise add it
if ident.isdigit():
#Cities are just lists with values (almost a pseudo-class)
city = [int(ident), int(x), int(y)]
cities.append(city)
except:
#The ident was not an int so we skip (pass) it and move on to the next
pass
#A function for bruteforcing the TSP problem based on a list of cities
def bruteForce(cities):
# Exhaustively search all permutations of city idents and return the lightest
# tour [path, weight] not exceeding the global `maxweight`.
# NOTE(review): indentation was lost in this paste. There are two
# `return tour` statements; if the first one sits inside the loop body the
# search stops after the first improving permutation -- confirm against the
# original source before running.
global maxweight
#Tours are also stored as pseudo-class lists
#tour[0] is the path and tour[1] is the weight
#Make a start tour with a weight of infinity so all other tours will be smaller
tour = [[], float("inf")]
permparm = []
#In order to get all permutations we need an array containing the values 1 through n
#These values are the idents of the cities (their 0th element) so we get and add them
for city in cities:
permparm.append(city[0])
#We now generate all permutations of n length from the array containing 1 - n idents
#and loop through them looking for the smallest distance
# NOTE(review): list(...) materializes all n! permutations in memory at once.
for perm in list(itertools.permutations(permparm, len(permparm))):
#Get the total weight of the permutation
dist = getWeight(perm)
#Make a new tour to represent the current permutation
thisTour = [perm, dist]
#If the current tour is shorter than the old tour, point the old tour to the new one
if thisTour[1] < tour[1] and thisTour[1]<=int (maxweight):
tour = thisTour
return tour
#Once we have gone through every permutation we have the shortest tour so return it
return tour
#A function to get the total weight of a path
#This function is messy because of an off-by-1 error introduced by the tour file starting at 1 instead of 0
def getWeight(perm):
# Total cycle length of the tour `perm` (a sequence of 1-based city idents),
# reading coordinates from the module-global `cities` list.
#Set the initial distance to 0
dist = 0
#We now need to calculate and add the distance between each city in the path from 0 to n
for index in range(len(perm)):
try:
#Pass the 2 cities to the distance formula and add the value to dist
dist += getDistance(cities[perm[index]-1], cities[perm[index+1]-1])
except:
#We don't need to check bounds because the final pass will throw an out-of-bounds
#exception so we just catch it and skip that calculation
# NOTE(review): this bare except also swallows unrelated errors; iterating
# range(len(perm) - 1) would avoid the exception entirely.
pass
#All TSP solutions are cycles so we now have to add the final city back to the initial city to the total dist
#Python has a nifty convention where list[-1] will return the last element, list[-2] will return second to last, etc.
dist += getDistance(cities[perm[-1]-1], cities[perm[0]-1])
#We now have the total distance so return it
#if (int(dist)<=80.0):
return dist
#A function to write the output of a tour to a file in a specified format
def toFile(tour):
# Write `tour` ([path, weight]) to a user-chosen location in TSPLIB
# .opt.tour format. Reads the module-global `cityFile` for the source name.
# NOTE(review): "DIMENSON" in the emitted header looks like a typo for
# "DIMENSION" -- left unchanged here since it alters the file format.
loc = input('Enter the location where you would like to save the tour:\n')
fname = cityFile.name
#Index for the last file separator to get ONLY the file name not its path
sep_index = 0
#Linux/OSX files use /'s to separate dirs so get the position of the last one in the name
if '/' in fname:
sep_index = fname.rindex('/')+1
#Windows files use \'s to separate dirs so get the position of the last one in the name
if '\\' in fname:
sep_index = fname.rindex('\\')+1
#Create the header for the output file
header = ('NAME : ' + str(fname[sep_index:-4]) + '.opt.tour\n'
'COMMENT : Optimal tour for ' + str(fname[sep_index:]) + ' (' + str(tour[1]) + ')\n'
'TYPE : Tour\n'
'DIMENSON : ' + str(len(tour[0])) + '\n'
'TOUR_SECTION\n')
#Create the trailer for the output file
trailer = "-1\nEOF\n"
#Create the output file and write the results to it
try:
f = open(loc,'w')
f.write(header)
for city in tour[0]:
f.write(str(city) + '\n')
f.write(trailer)
f.close()
print ('Successfully saved tour data to: ' + loc)
except Exception as e:
print (loc + ' could not be written to: ' + str(e))
#-------------------The actual script begins here-----------------------
# Script entry: load cities, brute-force the optimum tour, report timing.
cities = []
cityFile = getFile()
maxweight = find_max()
initfiles('in.txt')
#Start the stopwatch
start = time.time()
getCities(cityFile)
opt_tour = bruteForce(cities)
#Stop the stopwatch
finish = time.time()
print ('The optimum tour is: %s (%f)' % (opt_tour[0], opt_tour[1]))
print ('This solution took %0.3f seconds to calculate.' % (finish-start))
And my parallel code:
#!/usr/local/bin/python
import time, itertools, math
from multiprocessing import Pool,Manager
# Module-level state shared with the worker functions below.
# NOTE(review): under multiprocessing each worker process gets its own COPY of
# these globals, so updates made in workers are not visible elsewhere.
thisTour = None
tour = None
dist = None
totalct = 0
#Get the city file
#Get the city file
# Open and return the city file handle, or print an error and return None.
def getFile():
cityFile = 'in.txt'
try:
f = open(cityFile,'r')
return f
except Exception as e:
print(cityFile + ' could not be opened: ' + str(e))
# Return the max-weight token (second field of in.txt) as a string.
# NOTE(review): the handle is never closed and `max` shadows the builtin.
def find_max():
file = open('in.txt','r')
max = file.read().split()
return max[1]
def initfiles(path):
# Rewrite the city file with an "OK"-marked header, as in the sequential
# version, and record the city count in the global `totalct`.
# NOTE(review): indentation lost in this paste; code kept byte-identical.
global totalct
file = open(path, 'r')
max = file.read().splitlines()
totalct = len(max) - 2
file.close()
file = open(path, 'r')
num = file.read().split()
file.close()
final = list()
#print(totalct)
final.append(str(num[0] + ' ' + num[1] + ' OK'))
if (num[2] != 'OK'):
file_write = open(path, 'wt')
for i in range(1, int(num[0]) + 1):
# print(max[i])
final.append(str(i) + ' ' + str(max[i]))
# print(i)
# print(len(final))
for line in range(len(final)):
# print(final[line])
file_write.write(str(final[line]) + '\n')
print('File has been edited.')
file_write.close()
#Get distance between 2 cities
#Cities stored as [ident, x, y]
# Euclidean distance between two cities stored as [ident, x, y].
def getDistance(city1, city2):
return math.sqrt((int(city2[1]) - int(city1[1]))**2 + (int(city2[2])-int(city1[2]))**2)
#Get the cities from a specified city file
# Appends [int ident, int x, int y] triples to the module-global `cities`.
def getCities(cityFile):
for line in cityFile:
try:
#Lines split by a space char will look like:
#[ident, x-coord, y-coord]
ident = line.split(' ')[0]
x = line.split(' ')[1]
y = line.split(' ')[2].strip('\n')
#If the ident is not an int (not a city) skip it otherwise add it
if ident.isdigit():
#Cities are just lists with values (almost a pseudo-class)
city = [int(ident), int(x), int(y)]
cities.append(city)
except:
#The ident was not an int so we skip (pass) it and move on to the next
pass
#A function for bruteforcing the TSP problem based on a list of cities
def bruteForce(perm):
# Worker function mapped over every permutation: score `perm` and keep the
# best tour seen so far in the global `tour`.
# NOTE(review): each Pool worker has its own copies of `tour`, `maxweight`
# and `cities`, and every call's return value is pickled back to the parent;
# this per-permutation IPC overhead is the likely cause of the parallel
# version being far slower than the sequential one.
global maxweight
global tour
global thisTour
dist = getWeight(perm)
thisTour = [perm, dist]
#If the current tour is shorter than the old tour, point the old tour to the new one
if thisTour[1] < tour[1] and thisTour[1] <= int(maxweight):
tour = thisTour
return tour
#Once we have gone through every permutation we have the shortest tour so return it
return tour
#A function to get the total weight of a path
#This function is messy because of an off-by-1 error introduced by the tour file starting at 1 instead of 0
def getWeight(perm):
# Total cycle length of tour `perm`, reading the module-global `cities`.
#Set the initial distance to 0
dist = 0
#We now need to calculate and add the distance between each city in the path from 0 to n
for index in range(len(perm)):
try:
#Pass the 2 cities to the distance formula and add the value to dist
dist += getDistance(cities[perm[index]-1], cities[perm[index+1]-1])
except:
#We don't need to check bounds because the final pass will throw an out-of-bounds
#exception so we just catch it and skip that calculation
# NOTE(review): raising/catching per element is costly in a hot loop.
pass
#All TSP solutions are cycles so we now have to add the final city back to the initial city to the total dist
#Python has a nifty convention where list[-1] will return the last element, list[-2] will return second to last, etc.
dist += getDistance(cities[perm[-1] - 1], cities[perm[0] - 1])
#We now have the total distance so return it
return dist
#A function to write the output of a tour to a file in a specified format
def toFile(tour):
# Write `tour` ([path, weight]) in TSPLIB .opt.tour format to a user-chosen
# path; identical to the sequential version.
loc = input('Enter the location where you would like to save the tour:\n')
fname = cityFile.name
#Index for the last file separator to get ONLY the file name not its path
sep_index = 0
#Linux/OSX files use /'s to separate dirs so get the position of the last one in the name
if '/' in fname:
sep_index = fname.rindex('/')+1
#Windows files use \'s to separate dirs so get the position of the last one in the name
if '\\' in fname:
sep_index = fname.rindex('\\')+1
#Create the header for the output file
header = ('NAME : ' + str(fname[sep_index:-4]) + '.opt.tour\n'
'COMMENT : Optimal tour for ' + str(fname[sep_index:]) + ' (' + str(tour[1]) + ')\n'
'TYPE : Tour\n'
'DIMENSON : ' + str(len(tour[0])) + '\n'
'TOUR_SECTION\n')
#Create the trailer for the output file
trailer = "-1\nEOF\n"
#Create the output file and write the results to it
try:
f = open(loc,'w')
f.write(header)
for city in tour[0]:
f.write(str(city) + '\n')
f.write(trailer)
f.close()
print ('Successfully saved tour data to: ' + loc)
except Exception as e:
print (loc + ' could not be written to: ' + str(e))
#-------------------The actual script begins here-----------------------
# Build the full list of tour permutations from the global `act` list.
# NOTE(review): materializing all n! permutations as lists is an O(n!) memory
# cost and dominates startup time.
def permutations():
allperm = list(itertools.permutations(act, len(act)))
for i in allperm:
allpermlist.append(list(i))
print('permutations compute has been finished.')
return allpermlist
# Fill `act` with city idents 1 .. totalct-1.
# NOTE(review): range(1, totalct) excludes totalct itself -- verify the last
# city is meant to be skipped.
def allct ():
for i in range(1, totalct):
act.append(i)
# Pool initializer.
# NOTE(review): it only declares globals and assigns nothing, so it is
# effectively a no-op in the workers.
def init():
global thisTour
global tour
global dist
if __name__ == "__main__":
# Parallel entry point: build all permutations, then fan bruteForce() out
# over a process pool.
# NOTE(review): the Manager list created below is immediately replaced by a
# plain [] two lines later, so the Manager is never actually used.
cities = []
cityFile = getFile()
maxweight = find_max()
initfiles('in.txt')
manager = Manager()
allpermlist = manager.list()
tour = [[], float("inf")]
allpermlist = []
act = []
allct()
permutations()
#Start the stopwatch
start = time.time()
getCities(cityFile)
with Pool(processes=8, initializer=init) as p:
opt_tour = p.map_async(bruteForce, allpermlist, chunksize=2048)
opt_tour.wait()
print(opt_tour.get())
p.close()
#p.join()
finish = time.time()
print('This solution took %0.3f seconds to calculate.' % (finish-start))
And my in.txt :
9 47 OK
1 13 15
2 4 21
3 7 17
4 8 11
5 10 14
6 2 15
7 14 11
8 15 20
9 13 17

Is there some kind of limit to the amount of output Python 3.4 allows using the write() method at one time?

I put trailing print() methods right next to my write() method lines at the end of my code to test why my output files were incomplete. But, the print() output is "all the stuff" I expect; while the write() output is off by a confusing amount (only 150 out of 200 'things'). Reference Image of Output: IDLE versus external output file
FYI: Win 7 64 // Python 3.4.2
My modules take an SRT captions file ('test.srt') and returns a list object I create from it; in particular, one with 220 list entries of the form: [[(index), [time], string]]
# Output file for the timing report.
# NOTE(review): this handle is never closed, so the OS write buffer's tail is
# lost when the script exits -- close it (or use `with`) after the last write.
times = open('times.txt', 'w')
### A portion of Riobard's SRT Parser: srt.py
import re
def tc2ms(tc):
    """Convert an SRT timecode string (e.g. '01:02:03,004') to milliseconds.

    A leading '+' or '-' sets the sign; hours/minutes are optional.
    """
    sign = 1
    if tc[0] in "+-":
        sign = -1 if tc[0] == "-" else 1
        tc = tc[1:]
    TIMECODE_RE = re.compile('(?:(?:(?:(\d?\d):)?(\d?\d):)?(\d?\d))?(?:[,.](\d?\d?\d))?')
    match = TIMECODE_RE.match(tc)
    try:
        assert match is not None
    except AssertionError:
        # diagnostic echo of the unparseable timecode
        print(tc)
    # missing groups (hours/minutes/millis) default to zero
    hh, mm, ss, ms = (0 if g is None else int(g) for g in match.groups())
    return ((hh * 3600 + mm * 60 + ss) * 1000 + ms) * sign
# my code
# Parse test.srt into [[index, [time], text]] entries and write per-line
# timing info to `times`.
# NOTE(review): indentation was lost in this paste; code kept byte-identical.
with open('test.srt') as f:
file = f.read()
srt = []
# NOTE(review): `file` is a string here, so this loop iterates CHARACTERS and
# `line` is never used -- the split below is recomputed once per character.
for line in file:
splitter = file.split("\n\n")
# SRT splitter
i = 0
j = len(splitter)
# NOTE(review): the inner while consumes all entries on the first pass of the
# outer for; `list` also shadows the builtin.
for items in splitter:
while i <= j - 2:
split_point_1 = splitter[i].index("\n")
split_point_2 = splitter[i].index("\n", split_point_1 + 1)
index = splitter[i][:split_point_1]
time = [splitter[i][split_point_1:split_point_2]]
time = time[0][1:]
string = splitter[i][split_point_2:]
string = string[1:]
list = [[(index), [time], string]]
srt += list
i += 1
# time info outputter
i = 0
j = 1
for line in srt:
if i != len(srt) - 1:
indexer = srt[i][1][0].index(" --> ")
timein = srt[i][1][0][:indexer]
timeout = srt[i][1][0][-indexer:]
line_time = (tc2ms(timeout) - tc2ms(timein))/1000
space_time = ((tc2ms((srt[j][1][0][:indexer]))) - (tc2ms(srt[i][1][0][-indexer:])))/1000
out1 = "The space between Line " + str(i) + " and Line " + str(j) + " lasts " + str(space_time) + " seconds." + "\n"
out2 = "Line " + str(i) + ": " + str(srt[i][2]) + "\n\n"
times.write(out1)
times.write(out2)
print(out1, end="")
print(out2)
i += 1
j += 1
else:
indexer = srt[i][1][0].index(" --> ")
timein = srt[i][1][0][:indexer]
timeout = srt[i][1][0][-indexer:]
line_time = (tc2ms(timeout) - tc2ms(timein))/1000
outend = "Line " + str(i) + ": " + str(srt[i][2]) + "\n<End of File>"
# NOTE(review): `times` is never closed after this final write, which is why
# the file on disk is missing the buffered tail that print() still shows.
times.write(outend)
print(outend)
My two write() method output files, respectively, only print out either ~150 or ~200 items of the 220 things it otherwise correctly prints to the screen.
You want to close your times file when done writing; operating systems use write buffers to speed up file I/O, collecting larger blocks of data to be written to disk in one go; closing the file flushes that buffer:
times.close()
Consider opening the file in a with block:
with open('times.txt', 'w') as times:
# all code that needs to write to times

Check which label or name a part (from krn) has in music21

I want to extract 2 parts from 4 voice krn score and save them as a midi file.
I can load the files:
s = converter.parse('/something.krn')
I can get some basic info like this:
s.metadata.title
In v2, I want to store the part of s that has a label "Cantus". Any idea how to check for a label? They have a label in krn.
Once I have the number of the part, I can get it with
s.parts[i]
The krn file is defined like this:
**kern **kern **kern **kern **kern
*Ibass *Itenor *Itenor *Icalto *Icant
!Bassus !Tenor 2 !Tenor 1 !Altus !Cantus
I am guessing labels is not the correct name, as I can't find this in music21 documentation, perhaps the name of the part?
I can't seem to find the property in the music21 documentation.
I was finally able to do it this way:
import sys
from music21 import *
import os
# input ("Please make sure that you have places all the krn files in a subdirectory called data. Press enter to continue")
# For each .krn file under ./data: locate the part whose spine comment is
# "Cantus", then write one MIDI per (other part, Cantus) pair to ./midi.
# NOTE(review): indentation was lost in this paste; several else-branches whose
# bodies are only comments would be syntax errors as shown -- restore the
# original nesting before running.
for filename in os.listdir('./data'):
s = converter.parse('./data/' + filename)
sys.stdout.write('Processing ' + filename + '... ')
numcant = -1
nums = list()
try:
length = len(s.parts)
except:
length = 0
if (length > 0):
for num in range(0,length):
# sys.stdout.write(s.parts[num].flat.getElementsByClass('SpineComment')[0].comment + ' - ')
if (s.parts[num].flat.getElementsByClass('SpineComment')[0].comment == "Cantus"):
numcant = num
# print "cant "
# print numcant
else:
# print "nums"
nums.append(num)
# print num
else:
# sys.stdout.write(' - no parts present.')
sys.stdout.write('\n')
try:
length = len(nums)
except:
length = 0
if (length > 0):
sys.stdout.write('\n')
if (numcant != -1):
for num in nums:
sys.stdout.write(' - ' + filename[:-4] + '_' + str(num) + '.mid written.\n')
# print "cantus present"
s2 = stream.Stream()
s2.insert(0, s.parts[num])
s2.insert(0, s.parts[numcant])
# write the midi file
s2.write('midi', './midi/' + filename[:-4] + '_' + str(num) + '.mid')
# sys.stdout.write('I')
else:
sys.stdout.write(' - no cantus specified for this file.\n')
else:
sys.stdout.write(' - not enough parts in this file.\n')
sys.stdout.write('\n')

Cutting character values according to value from file

This is the which i am doing
import csv
# Output file opened in binary mode.
# NOTE(review): str values are written to it below, so this is Python 2 code
# ('wb' + str would raise TypeError on Python 3).
output = open('output.txt' , 'wb')
# this functions return the min for num.txt
def get_min(num):
    """Return the integer on the first line of '<num>.txt'.

    Fix: the original never closed the file handle; a context manager
    guarantees it is released.
    """
    with open('%s.txt' % num, 'r+') as f:
        return int(f.readlines()[0])
# temporary variables
last_line = ''
input_list = []
# iterate over input.txt, pairing each header line (even index) with the
# data line that follows it (odd index)
for i, line in enumerate(open('input.txt', 'r+').readlines()):
    if i % 2 == 0:
        last_line = line
    else:
        input_list.append((last_line, line))
# truncate each data line to the length looked up (via get_min) from the
# header's second-to-last character
filtered = [(header, data[:get_min(header[-2])] + '\n') for (header, data) in input_list]
# FIX: use a plain for-loop for the writes -- a list comprehension run purely
# for its side effects built a throwaway list of None values
for data in filtered:
    output.write(''.join(data))
output.close()
In this code input.txt is something like this
>012|013|0|3|M
AFDSFASDFASDFA
>005|5|67|0|6
ACCTCTGACC
>029|032|4|5|S
GGCAGGGAGCAGGCCTGTA
and num.txt is something like this
M 4
P 10
I want the script to look at the last column of each header line in input.txt above, find the matching value in num.txt, and cut that many characters from the following line according to that value.
I think the error in my code is that it only accepts text files containing integers, whereas it should also accept files that contain alphabetic characters.
The totally revised version, after a long chat with the OP;
import os
import re
# Strip a per-hash number of characters from the record bodies in input.txt,
# using counts read from num.txt, and write the result to input_new.txt.
# NOTE(review): this is Python 2 code (dict.iteritems); the input handles and
# the output file `f` are never closed -- the final write may be buffered.
# Fetch all hashes and counts
file_c = open('num.txt')
file_c = file_c.read()
lines = re.findall(r'\w+\.txt \d+', file_c)
numbers = {}
for line in lines:
line_split = line.split('.txt ')
hash_name = line_split[0]
count = line_split[1]
numbers[hash_name] = count
#print(numbers)
# The input file
file_i = open('input.txt')
file_i = file_i.read()
for hash_name, count in numbers.iteritems():
regex = '(' + hash_name.strip() + ')'
result = re.findall(r'>.*\|(' + regex + ')(.*?)>', file_i, re.S)
if len(result) > 0:
data_original = result[0][2]
stripped_data = result[0][2][int(count):]
file_i = file_i.replace(data_original, '\n' + stripped_data)
#print(data_original)
#print(stripped_data)
#print(file_i)
# Write the input file to new input_new.txt
f = open('input_new.txt', 'wt')
f.write(file_i)
You can do it like so;
import re
# Example: count matches of a pattern with a re.sub callback and remove only
# the occurrences after `min_count`.
# NOTE(review): `input` shadows the builtin; when counter <= min_count the
# callback returns None, and re.sub requires the replacement callable to
# return a string -- verify this branch before relying on it.
min_count = 4 # this variable will contain that count integer from where to start removing
str_to_match = 'EOG6CC67M' # this variable will contain the filename you read
input = '' # The file input (input.txt) will go in here
counter = 0
def callback_f(e):
global min_count
global counter
counter += 1
# Check your input
print(str(counter) + ' >>> ' + e.group())
# Only replace the value with nothing (remove it) after a certain count
if counter > min_count:
return '' # replace with nothing
result = re.sub(r''+str_to_match, callback_f, input)
With this tactic you can keep count with a global counter and there's no need to do hard line-loops with complex structures.
Update
More detailed version with file access;
import os
import re
# Batch variant: collect hash-name -> count from ./num_files, then scan every
# CSV in ./csv_files, counting matches of each hash with a re.sub callback.
# NOTE(review): Python 2 code (dict.iteritems); file handles are never closed
# and `file` shadows the builtin.
def callback_f(e):
global counter
counter += 1
# Check your input
print(str(counter) + ' >>> ' + e.group())
# Fetch all hash-file names and their content (count)
num_files = os.listdir('./num_files')
numbers = {}
for file in num_files:
if file[0] != '.':
file_c = open('./num_files/' + file)
file_c = file_c.read()
numbers[file.split('.')[0]] = file_c
# Now the CSV files
csv_files = os.listdir('./csv_files')
for file in csv_files:
if file[0] != '.':
for hash_name, min_count in numbers.iteritems():
file_c = open('./csv_files/' + file)
file_c = file_c.read()
counter = 0
result = re.sub(r''+hash_name, callback_f, file_c)
# Write the replaced content back to the file here
Considered directory/file structure;
+ Projects
+ Project_folder
+ csv_files
- input1.csv
- input2.csv
~ etc.
+ num_files
- EOG6CC67M.txt
- EOG62JQZP.txt
~ etc.
- python_file.py
The CSV files contain the big chunks of text you state in your original question.
The Num files contain the hash-files with an Integer in them
What happens in this script;
Collect all Hash files (in a dictionary) and it's inner count number
Loop through all CSV files
Subloop through the collected numbers for each CSV file
Replace/remove (based on what you do in callback_f()) hashes after a certain count
Write the output back (it's the last comment in the script, would contain the file.write() functionality)

Categories