I want to know how to get samples out of a .wav file in order to perform a windowed join of two .wav files.
Can anyone please tell me how to do this?
The wave module of the standard library is the key. After import wave at the top of your code, wave.open('the.wav', 'r') returns a "wave read" object from which you can read frames with the .readframes method, which returns a string of bytes that are the samples, in whatever format the wave file has them. You can determine the two parameters relevant to decomposing frames into samples with the .getnchannels method (number of channels) and .getsampwidth (number of bytes per sample).
The best way to turn the string of bytes into a sequence of numeric values is with the array module, using a type code that matches the sample width: 'B' for 1 byte per sample (8-bit WAV samples are unsigned) and 'h' or 'l' for 2 or 4 bytes per sample (16- and 32-bit samples are signed). Note that 'l' is 4 bytes on a 32-bit build of Python; you can check the itemsize value of your array object to double-check this. If you have a sample width that array can't handle, you'll need to slice up the byte string (padding each little slice appropriately with zero bytes) and use the struct module instead (but that's clunkier and slower, so use array if you can).
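For instance, here is a minimal sketch along those lines (assuming a 16-bit PCM file; 'the.wav' is a placeholder name):

import wave
import array

w = wave.open('the.wav', 'r')
nchannels = w.getnchannels()           # samples per frame
sampwidth = w.getsampwidth()           # bytes per sample
frames = w.readframes(w.getnframes())  # raw bytes for the whole file
w.close()

samples = array.array('h', frames)     # 'h' = signed 16-bit, matching sampwidth == 2
# for multi-channel audio the channels are interleaved:
# channel 0 is samples[0::nchannels], channel 1 is samples[1::nchannels], etc.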
You can use the wave module. First you should read the metadata, such as the sample width and the number of channels. Using the readframes() method you can read the samples, but only as a byte string. Based on the sample format, you have to convert them to numbers using struct.unpack().
Alternatively, if you want the samples as a NumPy array (which you can then convert to floating-point values yourself), you can use SciPy's io.wavfile module.
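For instance, a sketch with SciPy (assuming a 16-bit PCM file named 'the.wav'; wavfile.read returns the samples in the file's native integer dtype):

from scipy.io import wavfile
import numpy as np

rate, data = wavfile.read('the.wav')          # rate in Hz, data as a NumPy array (int16 here)
samples = data.astype(np.float32) / 32768.0   # scale to roughly -1.0 .. 1.0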
Here's a function to read samples from a wave file (tested with mono & stereo):
def read_samples(wave_file, nb_frames):
    frame_data = wave_file.readframes(nb_frames)
    if frame_data:
        sample_width = wave_file.getsampwidth()
        nb_samples = len(frame_data) // sample_width
        format = {1: "%db", 2: "<%dh", 4: "<%dl"}[sample_width] % nb_samples
        return struct.unpack(format, frame_data)
    else:
        return ()
And here's the full script that does windowed mixing or concatenation of multiple .wav files. All input files need to have the same parameters (number of channels, sample width and frame rate).
import argparse
import itertools
import struct
import sys
import wave

def _struct_format(sample_width, nb_samples):
    return {1: "%db", 2: "<%dh", 4: "<%dl"}[sample_width] % nb_samples

def _mix_samples(samples):
    return sum(samples) // len(samples)

def read_samples(wave_file, nb_frames):
    frame_data = wave_file.readframes(nb_frames)
    if frame_data:
        sample_width = wave_file.getsampwidth()
        nb_samples = len(frame_data) // sample_width
        format = _struct_format(sample_width, nb_samples)
        return struct.unpack(format, frame_data)
    else:
        return ()

def write_samples(wave_file, samples, sample_width):
    format = _struct_format(sample_width, len(samples))
    frame_data = struct.pack(format, *samples)
    wave_file.writeframes(frame_data)

def compatible_input_wave_files(input_wave_files):
    nchannels, sampwidth, framerate, nframes, comptype, compname = input_wave_files[0].getparams()
    for input_wave_file in input_wave_files[1:]:
        nc, sw, fr, nf, ct, cn = input_wave_file.getparams()
        if (nc, sw, fr, ct, cn) != (nchannels, sampwidth, framerate, comptype, compname):
            return False
    return True

def mix_wave_files(output_wave_file, input_wave_files, buffer_size):
    output_wave_file.setparams(input_wave_files[0].getparams())
    sampwidth = input_wave_files[0].getsampwidth()
    max_nb_frames = max([input_wave_file.getnframes() for input_wave_file in input_wave_files])
    for frame_window in xrange(max_nb_frames // buffer_size + 1):
        all_samples = [read_samples(wave_file, buffer_size) for wave_file in input_wave_files]
        mixed_samples = [_mix_samples(samples) for samples in itertools.izip_longest(*all_samples, fillvalue=0)]
        write_samples(output_wave_file, mixed_samples, sampwidth)

def concatenate_wave_files(output_wave_file, input_wave_files, buffer_size):
    output_wave_file.setparams(input_wave_files[0].getparams())
    sampwidth = input_wave_files[0].getsampwidth()
    for input_wave_file in input_wave_files:
        nb_frames = input_wave_file.getnframes()
        for frame_window in xrange(nb_frames // buffer_size + 1):
            samples = read_samples(input_wave_file, buffer_size)
            if samples:
                write_samples(output_wave_file, samples, sampwidth)

def argument_parser():
    parser = argparse.ArgumentParser(description='Mix or concatenate multiple .wav files')
    parser.add_argument('command', choices=("mix", "concat"), help='command')
    parser.add_argument('output_file', help='output .wav file')
    parser.add_argument('input_files', metavar="input_file", help='input .wav files', nargs="+")
    parser.add_argument('--buffer_size', type=int, help='number of frames to read per iteration', default=1000)
    return parser

if __name__ == '__main__':
    args = argument_parser().parse_args()
    input_wave_files = [wave.open(name, "rb") for name in args.input_files]
    if not compatible_input_wave_files(input_wave_files):
        print "ERROR: mixed wave files must have the same params."
        sys.exit(2)
    output_wave_file = wave.open(args.output_file, "wb")
    if args.command == "mix":
        mix_wave_files(output_wave_file, input_wave_files, args.buffer_size)
    elif args.command == "concat":
        concatenate_wave_files(output_wave_file, input_wave_files, args.buffer_size)
    output_wave_file.close()
    for input_wave_file in input_wave_files:
        input_wave_file.close()
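For example, assuming the script is saved as wavetool.py (a name chosen here just for illustration) and run under Python 2, which it targets (xrange, izip_longest, print statements):

python wavetool.py mix mixed.wav first.wav second.wav
python wavetool.py concat joined.wav first.wav second.wav --buffer_size 2000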
After reading the samples (for example with the wave module, more details here), you may want to have the values scaled between -1 and 1, as this is the convention for audio signals.
In this case, you can add:
# scale to -1.0 -- 1.0
max_nb_bit = float(2**(nb_bits-1))
samples = signal_int / (max_nb_bit + 1.0)
with nb_bits the bit depth and signal_int the integer values.
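For instance, a minimal sketch putting this together for a 16-bit mono file (the name 'the.wav' is a placeholder):

import wave
import struct

w = wave.open('the.wav', 'r')
nb_bits = w.getsampwidth() * 8                             # bit depth, 16 here
raw = w.readframes(w.getnframes())
signal_int = struct.unpack('<%dh' % (len(raw) // 2), raw)  # signed 16-bit samples
w.close()

max_nb_bit = float(2**(nb_bits - 1))
samples = [s / (max_nb_bit + 1.0) for s in signal_int]     # values now between -1.0 and 1.0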
I'm working with the struct module for the first time, and my code gives me this error: "unpack requires a buffer of 1486080 bytes"
Here is my code:
def speed_up(n):
    source = wave.open('sound.wav', mode='rb')
    dest = wave.open('out.wav', mode='wb')
    dest.setparams(source.getparams())
    frames_count = source.getnframes()
    data = struct.unpack("<" + str(frames_count) + "h", source.readframes(frames_count))
    new_data = []
    for i in range(0, len(data), n):
        new_data.append(data[i])
    newframes = struct.pack('<' + str(len(new_data)) + 'h', new_data)
    dest.writeframes(newframes)
    source.close()
    dest.close()
How do I figure out which format I should use?
The issue in your code is that you're providing struct.unpack with the wrong number of bytes. This is because of your usage of the wave module: Each frame in a wave file has getnchannels() samples, so when calling readframes(n) you will get back n * getnchannels() samples and this is the number you'd have to pass to struct.unpack.
To make your code more robust, you'd also need to look at getsampwidth() and use an appropriate format character, but the vast majority of wave files are 16-bit.
In the comments you also mentioned that the code didn't work after adding print(len(source.readframes(frames_count))). You didn't show the full code but I assume this is because you called readframes twice without calling rewind, so the second call didn't have any more data to return. It would be best to store the result in a variable if you want to use it in multiple lines.
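For example, a sketch of the corrected read and unpack, keeping the variable names from the question and assuming 16-bit samples as discussed above:

nchannels = source.getnchannels()
frames_count = source.getnframes()
frame_data = source.readframes(frames_count)
# each frame contains one 16-bit sample per channel
data = struct.unpack("<%dh" % (frames_count * nchannels), frame_data)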
I am trying to read bytes from an image and get all the 16-bit int values from that image.
After I parsed the image header, I got to the pixel values. The value I get when a pair of bytes is something like b"\xd4\x00" is incorrect: in this case it should be 54272, not 3392.
These are the relevant parts of the code:
I use a generator to get the bytes:
import itertools
def osddef_generator(in_file):
    with open(in_file, mode='rb') as f:
        dat = f.read()
        for byte in dat:
            yield byte

def take_slice(in_generator, size):
    return ''.join(str(chr(i)) for i in itertools.islice(in_generator, size))

def take_single_pixel(in_generator):
    pix = itertools.islice(in_generator, 2)
    hex_list = [hex(i) for i in pix]
    hex_str = "".join(hex_list)[2:].replace("0x", '')
    intval = int(hex_str, 16)
    print("hex_list: ", hex_list)
    print("hex_str: ", hex_str)
    print("intval: ", intval)
After I get the header correctly using the take_slice method, I get to the part with the pixel values, where I use the take_single_pixel method. This is where I get the bad results.
This is what I get:
hex_list: ['0xd4', '0x0']
hex_str: d40
intval: 3392
But the actual sequence of bytes that should be interpreted is \xd4\x00, which equals 54272, so hex_list should be ['0xd4', '0x00'] and hex_str should be d400.
Something goes wrong whenever the second byte of the pair is \x00.
Got any ideas? Thanks!
There are much better ways of converting bytes to integers:
int.from_bytes() takes a bytes input and a byte order argument:
>>> int.from_bytes(b"\xd4\x00", 'big')
54272
>>> int.from_bytes(b"\xd4\x00", 'little')
212
The struct.unpack() function lets you convert a whole series of bytes to integers following a pattern:
>>> import struct
>>> struct.unpack('!4H', b'\xd4\x00\xd4\x00\xd4\x00\xd4\x00')
(54272, 54272, 54272, 54272)
The array module lets you read binary data representing homogeneous integer data into a memory structure efficiently:
>>> import array
>>> arr = array.array('H')
>>> arr.fromfile(fileobject, number_of_values)  # read that many 16-bit integers from an open binary file
However, array can't be told what byte order to use. You'd have to determine the current architecture's byte order (sys.byteorder) and call arr.byteswap() to reverse it if the machine order doesn't match the file order.
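A sketch of that check (the filename and the count of values are placeholders; this assumes the file stores big-endian 16-bit values):

import array
import sys

arr = array.array('H')
with open('image.bin', 'rb') as f:
    arr.fromfile(f, 100)         # read 100 unsigned 16-bit values
if sys.byteorder == 'little':    # machine order differs from the (big-endian) file order
    arr.byteswap()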
When reading image data, it is almost always preferable to use the struct module to do the parsing. You generally then use file.read() calls with specific sizes; if the header consists of 10 bytes, use:
headerinfo = struct.unpack('<expected header pattern for 10 bytes>', f.read(10))
and go from there. For examples, look at the Pillow / PIL image plugins source code; here is how the Blizzard Mipmap image format header is read:
def _read_blp_header(self):
    self._blp_compression, = struct.unpack("<i", self.fp.read(4))
    self._blp_encoding, = struct.unpack("<b", self.fp.read(1))
    self._blp_alpha_depth, = struct.unpack("<b", self.fp.read(1))
    self._blp_alpha_encoding, = struct.unpack("<b", self.fp.read(1))
    self._blp_mips, = struct.unpack("<b", self.fp.read(1))
    self._size = struct.unpack("<II", self.fp.read(8))
    if self.magic == b"BLP1":
        # Only present for BLP1
        self._blp_encoding, = struct.unpack("<i", self.fp.read(4))
        self._blp_subtype, = struct.unpack("<i", self.fp.read(4))
    self._blp_offsets = struct.unpack("<16I", self.fp.read(16 * 4))
    self._blp_lengths = struct.unpack("<16I", self.fp.read(16 * 4))
Because struct.unpack() always returns tuples, you can assign the individual elements of a tuple to names on the left-hand side, including single_name, = assignments to extract a single result.
The separate set of read calls above could also be compressed into fewer calls:
comp, enc, adepth, aenc, mips, *size = struct.unpack("<i4b2I", self.fp.read(16))
if self.magic == b"BLP1":
    # Only present for BLP1
    enc, subtype = struct.unpack("<2i", self.fp.read(8))
followed by specific attribute assignments.
I have a Python script that receives chunks of raw binary audio data, and I would like to change the sample rate of those chunks to 16000 Hz and then pipe them to another component.
I tried my luck with audiotools, but without success:
# f is a filelike FIFO buffer
reader = PCMFileReader(f, 44100, 1, 1, 16)
conv = PCMConverter(reader, 16000, 1, 1, 16)
Then I just write to the buffer whenever I get a new chunk:
f.write(msg)
And read from the buffer in another thread:
while not reader.file.closed:
    fl = conv.read(10)
    chunk = fl.to_bytes(False, True)
The problem is that I get this ValueError, which seems to come from the "samplerate.c" library:
ValueError: SRC_DATA->data_out is NULL
This error only occurs with resampling; if I turn that step off, everything works fine and I get playable audio.
Hence my question: what would be a good tool for this task? And if audiotools turns out to be the right answer, how do I use it correctly?
Here is a simplified resampler. dataFormat is the number of bytes per sample frame (e.g. 16-bit stereo = 4), original_samples is the number of frames in the source binary string, desired_samples is the desired number of frames (e.g. 16 kHz -> 44.1 kHz gives original = 160 and desired = 441 for a 10 ms chunk), pcm is the source binary string, and the return value is the resampled binary string:
import itertools

def resampleSimplified(pcm, desired_samples, original_samples, dataFormat):
    samples_to_pad = desired_samples - original_samples
    q, r = divmod(desired_samples, original_samples)
    times_to_pad_up = q + int(bool(r))
    times_to_pad_down = q

    # split the raw PCM string into one slice per sample frame
    pcmList = [pcm[i:i + dataFormat] for i in range(0, len(pcm), dataFormat)]

    if samples_to_pad > 0:
        # upsampling: repeat each frame times_to_pad_up times
        pcmListPadded = list(itertools.chain.from_iterable(
            itertools.repeat(x, times_to_pad_up) for x in pcmList)
        )
    else:
        # downsampling: keep every times_to_pad_down-th frame
        if times_to_pad_down > 0:
            pcmListPadded = pcmList[::times_to_pad_down]
        else:
            pcmListPadded = pcmList

    padded_pcm = b''.join(pcmListPadded[:desired_samples])
    return padded_pcm
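For example, to stretch a 10 ms chunk of 16-bit mono audio from 16 kHz to 44.1 kHz (pcm_chunk is assumed to be a 320-byte binary string, i.e. 160 frames):

resampled = resampleSimplified(pcm_chunk, desired_samples=441, original_samples=160, dataFormat=2)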
I'm trying to unpack the data from a single channel WAVE file using struct.unpack. I want to store the data in an array and be able to manipulate it (say by adding noise of given variance). I have extracted the header data and stored it in a dictionary as follows:
stHeaderFields['ChunkSize'] = struct.unpack('<L', bufHeader[4:8])[0]
stHeaderFields['Format'] = bufHeader[8:12]
stHeaderFields['Subchunk1Size'] = struct.unpack('<L', bufHeader[16:20])[0]
stHeaderFields['AudioFormat'] = struct.unpack('<H', bufHeader[20:22])[0]
stHeaderFields['NumChannels'] = struct.unpack('<H', bufHeader[22:24])[0]
stHeaderFields['SampleRate'] = struct.unpack('<L', bufHeader[24:28])[0]
stHeaderFields['ByteRate'] = struct.unpack('<L', bufHeader[28:32])[0]
stHeaderFields['BlockAlign'] = struct.unpack('<H', bufHeader[32:34])[0]
stHeaderFields['BitsPerSample'] = struct.unpack('<H', bufHeader[34:36])[0]
When I pass in a file I get the following output:
NumChannels: 1
ChunkSize: 78476
BloackAlign: 0
Filename: foo.wav
ByteRate: 32000
BlockAlign: 2
AudioFormat: 1
SampleRate: 16000
BitsPerSample: 16
Format: WAVE
Subchunk1Size: 16
I then try to get the data by doing struct.unpack('<h', self.bufHeader[36:])[0], but this just returns a single integer value, 24932. I'm not allowed to use the wave library or anything else wave-specific, as I will have to adapt this to other sorts of signals. How can I store and manipulate the actual wave data?
EDIT:
while chunk_reader < stHeaderFields['ChunkSize']:
    data.append(struct.unpack('<H', bufHeader[chunk_reader:chunk_reader + stHeaderFields['BlockAlign']]))
Okay, I'll try to write a complete walk-through.
First, it is a common mistake to treat a WAV (or, more precisely, RIFF) file as a linear structure. It is actually a tree, with each element having a 4-byte tag, a 4-byte length of the data and/or child elements, and some kind of data inside.
It is common for WAV files to have only two child elements ('fmt ' and 'data'), but a file may also have metadata ('LIST') with some child elements ('INAM', 'IART', 'ICMT' etc.) or other elements. There is also no actual ordering requirement for the blocks, so it is incorrect to assume that 'data' follows 'fmt ', because metadata may sit in between.
So let's look at the RIFF file:
'RIFF'
|-- file type ('WAVE')
|-- 'fmt '
| |-- AudioFormat
| |-- NumChannels
| |-- ...
| L_ BitsPerSample
|-- 'LIST' (optional)
| |-- ... (other tags)
| L_ ... (other tags)
L_ 'data'
|-- sample 1 for channel 1
|-- ...
|-- sample 1 for channel N
|-- sample 2 for channel 1
|-- ...
|-- sample 2 for channel N
L_ ...
So, how should you read a WAV file? Well, first you need to read 4 bytes from the beginning of the file and make sure it is the RIFF or RIFX tag, otherwise it is not a valid RIFF file. The difference between RIFF and RIFX is that the former uses little-endian encoding (and is supported everywhere) while the latter uses big-endian (and virtually nobody supports it). For simplicity let's assume we're dealing only with little-endian RIFF files.
Next you read the root element length (in the file's endianness) and the following file type. If the file type is not WAVE, it is not a WAV file, so you may abandon further processing. After reading the root element, you start reading all the child elements and process the ones you're interested in.
Reading fmt header is pretty straightforward, and you have actually done it in your code.
Data samples are usually represented as 1, 2, 3 or 4 bytes (again, in the file's endianness). The most common format is the so-called s16_le (you might have seen such naming in audio processing utilities like ffmpeg), which means samples are stored as signed 16-bit integers in little endian. Other possible formats are u8 (8-bit samples are unsigned numbers!), s24_le and s32_le. Data samples are interleaved, so it is easy to seek to an arbitrary position in the stream even for multi-channel audio. Note: this is valid only for uncompressed WAV files, as indicated by AudioFormat == 1. For other formats, data samples may have a different layout.
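For reference, a sketch of the struct format characters those uncompressed formats map to (prefix '<' for little-endian; 24-bit has no single format character and needs manual handling):

SAMPLE_FORMAT_CHARS = {
    8:  'B',  # u8: unsigned
    16: 'h',  # s16_le: signed
    32: 'l',  # s32_le: signed
}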
So let's take a look at a simple WAV reader:
import struct

stHeaderFields = dict()
rawData = None

with open("file.wav", "rb") as f:
    riffTag = f.read(4)
    if riffTag != 'RIFF':
        print 'not a valid RIFF file'
        exit(1)

    riffLength = struct.unpack('<L', f.read(4))[0]
    riffType = f.read(4)
    if riffType != 'WAVE':
        print 'not a WAV file'
        exit(1)

    # now read children
    while f.tell() < 8 + riffLength:
        tag = f.read(4)
        length = struct.unpack('<L', f.read(4))[0]
        if tag == 'fmt ':  # format element
            fmtData = f.read(length)
            # only the first 16 bytes are the standard PCM fields
            fmt, numChannels, sampleRate, byteRate, blockAlign, bitsPerSample = struct.unpack('<HHLLHH', fmtData[:16])
            stHeaderFields['AudioFormat'] = fmt
            stHeaderFields['NumChannels'] = numChannels
            stHeaderFields['SampleRate'] = sampleRate
            stHeaderFields['ByteRate'] = byteRate
            stHeaderFields['BlockAlign'] = blockAlign
            stHeaderFields['BitsPerSample'] = bitsPerSample
        elif tag == 'data':  # data element
            rawData = f.read(length)
        else:  # some other element, just skip it
            f.seek(length, 1)
Now we know the file format info and its sample data, so we can parse it. As was said, samples may have any size, but for now let's assume we're dealing only with 16-bit samples:
blockAlign = stHeaderFields['BlockAlign']
numChannels = stHeaderFields['NumChannels']

# some sanity checks
assert(stHeaderFields['BitsPerSample'] == 16)
assert(numChannels * stHeaderFields['BitsPerSample'] == blockAlign * 8)

for offset in range(0, len(rawData), blockAlign):
    samples = struct.unpack('<' + 'h' * numChannels, rawData[offset:offset + blockAlign])
    # now samples contains a tuple with sample values for each channel
    # (in case of mono audio, you'll have a tuple with just one element).
    # you may store it in the array for future processing,
    # change and immediately write to another stream, whatever.
So now you have all the samples in rawData, and you may access and modify them as you like. It might be handy to use Python's array module to access and modify the data efficiently (though it won't do for 24-bit audio, where you'll need to write your own serialization and deserialization).
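For instance, a sketch using array for 16-bit samples (matching the Python 2 style of the code above):

import array
samples = array.array('h')            # signed 16-bit samples
samples.fromstring(rawData)           # use frombytes() on Python 3
samples = array.array('h', (s // 2 for s in samples))   # e.g. halve the volume
rawData = samples.tostring()          # back to a byte string (tobytes() on Python 3)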
After you're done with the data processing (which may involve upscaling or downscaling the number of bits per sample, changing the number of channels, sound level manipulation etc.), you just write a new RIFF header with the correct data length (usually computed with the simplified formula 36 + len(rawData)), an altered fmt header, and the data stream.
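A sketch of writing the result back out as an uncompressed PCM file, using the fields gathered above ('out.wav' is a placeholder name, and the code follows the Python 2 style of the reader):

with open('out.wav', 'wb') as out:
    out.write('RIFF')
    out.write(struct.pack('<L', 36 + len(rawData)))   # total chunk size
    out.write('WAVE')
    out.write('fmt ')
    out.write(struct.pack('<L', 16))                  # fmt chunk length for plain PCM
    out.write(struct.pack('<HHLLHH',
                          stHeaderFields['AudioFormat'],
                          stHeaderFields['NumChannels'],
                          stHeaderFields['SampleRate'],
                          stHeaderFields['ByteRate'],
                          stHeaderFields['BlockAlign'],
                          stHeaderFields['BitsPerSample']))
    out.write('data')
    out.write(struct.pack('<L', len(rawData)))
    out.write(rawData)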
Hope this helps.
I'm using the script found on this blog: Google speech recognition with python (all credit to the author).
import sys
import pyaudio, speex
import numpy as np # just for doing a standard deviation for audio level checks
import urllib2
import wave
e = speex.Encoder()
e.initialize(speex.SPEEX_MODEID_WB)
d = speex.Decoder()
d.initialize(speex.SPEEX_MODEID_WB)
chunk = 320 # tried other numbers... some don't work
FORMAT = pyaudio.paInt16
bytespersample=2
CHANNELS = 1
RATE = 16000 # "wideband" mode for speex. May work with 8000. Haven't tried it.
p = pyaudio.PyAudio()
# Start the stream to record the audio
stream = p.open(format = FORMAT,
                channels = CHANNELS,
                rate = RATE,
                input = True,
                output = True,
                frames_per_buffer = chunk)
print "Listening. Recording will start when some sound is heard."
threshold = 200 # Adjust this to be slightly above the noise level of your recordings.
nquit = 40 # number of silent frames before terminating the program
nover = 0
keepgoing = True
spxlist=[] # list of the encoded speex packets/frames
while keepgoing:
    data = stream.read(chunk)  # grab 320 samples from the microphone
    spxdata = e.encode(data)   # encode using the speex dll
    print "Length encoded: %d"%len(spxdata)  # print the length, after encoding. Can't exceed 255!
    spxlist.append(spxdata)
    a=np.frombuffer(data,np.int16)  # convert to numpy array to check for silence or audio
    audiolevel=np.std(a)
    if audiolevel < threshold:  # too quiet
        nover+=1
    else:
        nover=0
    if nover >= nquit:
        keepgoing=False
    print '%2.1f (%d%% quiet)'%(audiolevel, nover*100/nquit)

print "Too quiet. I'm stopping now."
stream.stop_stream()
stream.close()
p.terminate()
fullspx=''.join(spxlist) # make a string of all the header-ized speex packets
out_file = open("test.spx","wb")
out_file.write(fullspx)
out_file.close()
As you can see, I slightly modified the script to make it write an output file in .spx, but it doesn't work.
Any advice?
Thanks for your help.
Edit:
I'm running this script on an Ubuntu Linux machine.