import numpy as np
import pandas as pd

def get_values_for_frequency(freq):
    # sampling information
    Fs = 100  # sample rate (number of samples per second)
    T = 1/Fs  # sampling period (seconds per sample)
    t = 1     # seconds of sampling
    N = Fs*t  # total points in signal
    # signal information
    #freq = 100 # in hertz
    omega = 2*np.pi*freq  # angular frequency for sine waves
    t_vec = np.arange(N)*T  # time vector for plotting
    y = np.sin(omega*t_vec)
    return y
df = pd.DataFrame(columns=['1Hz', '2Hz', '3Hz', '4Hz', '5Hz', '6Hz', '7Hz'])
df['1Hz'] = pd.Series(get_values_for_frequency(1))
df['2Hz'] = pd.Series(get_values_for_frequency(2))
df['3Hz'] = pd.Series(get_values_for_frequency(3))
df['4Hz'] = pd.Series(get_values_for_frequency(4))
df['5Hz'] = pd.Series(get_values_for_frequency(5))
df['6Hz'] = pd.Series(get_values_for_frequency(6))
df['7Hz'] = pd.Series(get_values_for_frequency(7))
#df.to_csv('samplepersecond.csv')
ndary = df.to_records(index=False)
This is the code to generate a sine wave. Here I generated sine waves in 7 columns (from 1 Hz to 7 Hz) with 100 rows, and stored all the values in a pandas DataFrame. Now the requirement is to convert this DataFrame into a binary file with a datatype of int16, i.e. each value in the DataFrame should be converted to a 16-bit signed integer and then written out as a binary file.
You can convert your data frame values to int16 by using the astype function.
import numpy as np
df = df.astype(np.int16)
Then you can save your data frame in HDF5 format by using to_hdf.
df.to_hdf('tmp.hdf','df', mode='w')
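If you need a raw binary file rather than HDF5, numpy's tofile writes the array buffer directly. A minimal sketch, assuming you want to quantize the [-1, 1] sine values to the full int16 range (casting them to int16 directly would truncate almost everything to 0); the filename and the 32767 scale factor are illustrative:
import numpy as np
# scale [-1, 1] floats into the int16 range before casting, then dump the raw bytes
scaled = (df.to_numpy() * 32767).astype(np.int16)
scaled.tofile('samples.bin')  # raw int16 values, native byte order, row-major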
I've got a weird question for a class project. Assuming X ~ Exp(Lambda), Lambda=1.6, I have to generate 100 samples of X, with the indices corresponding to the sample size of each generated sample (S1, S2 ... S100). I've worked out a simple loop which generates the required samples into an array, but I am not able to name the resulting arrays.
First attempt:
import numpy as np
import matplotlib.pyplot as plt
samples = []
for i in range(1, 101):
    samples.append(np.random.exponential(scale=1/1.6, size=i))
Second attempt:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

df_samples = pd.DataFrame()
for i in range(1, 101):
    samples = np.random.exponential(scale=1/1.6, size=i)
    col = f'samples {i}'
    df_samples[col] = samples
An example of how I would like to visualize the data:
# drawing 50 random samples of size 2 from the exponentially distributed population
rate = 1.6
sample_size = 2
df2 = pd.DataFrame(index=['x1', 'x2'])
for i in range(1, 51):
    exponential_sample = np.random.exponential(1/rate, sample_size)
    col = f'sample {i}'
    df2[col] = exponential_sample
# Taking a peek at the samples
df2
But instead of having a fixed size = 2, I would like to have sample size = i. This way I will be able to generate 1 row for the first column (S1), 2 rows for the second column (S2), and so on, until I reach 100 rows for the 100th column (S100).
You cannot easily stick vectors of different lengths into a DataFrame, so your mock-up code would not work, but you can concat one vector at a time:
df = pd.DataFrame()
for i in range(100, 10100, 100):
    tmp = pd.DataFrame({f'S{i}': np.random.exponential(scale=1/1.2, size=i)})
    df = pd.concat([df, tmp], axis=1)
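Note that growing a DataFrame with pd.concat inside a loop copies the accumulated frame on every iteration; collecting the pieces in a list and concatenating once at the end is usually faster.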
Use a dict instead maybe?
samples = {}
for i in range(100, 10100, 100):
    samples[i] = np.random.exponential(scale=1/1.2, size=i)
Then you can convert it into a pandas DataFrame if you like, as sketched below.
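For instance, wrapping each unequal-length array in a Series lets pandas pad the shorter columns with NaN (a minimal sketch based on the dict above):
import pandas as pd
# each Series keeps its own length; pandas aligns on the index and pads with NaN
df = pd.DataFrame({f'S{k}': pd.Series(v) for k, v in samples.items()})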
I am trying to generate random data with pandas.
The data needs to be stored in two columns. The first column needs to contain categorical variables (from Stratum_1 up to Stratum_19); each of these strata can contain a random number of values.
The second column needs to have data in the range between 1 and 180000000, with a standard deviation of 453210, a mean of 170000, and 100000 rows.
I tried:
import numpy as np
import pandas as pd

categorical = {'name': ['Stratum_1','Stratum_2','Stratum_3','Stratum_4','Stratum_5','Stratum_6','Stratum_7','Stratum_8','Stratum_9',
                        'Stratum_10','Stratum_11','Stratum_12','Stratum_13','Stratum_14','Stratum_15','Stratum_16','Stratum_17','Stratum_18','Stratum_19']}
desired_mean = 170000
desired_std_dev = 453210
df = pd.DataFrame(np.random.randint(0, 180000000, size=(100000, 1)), columns=list('1'))
This code runs, but I don't know how to combine the categorical and numerical values with the desired mean and standard deviation. Can anybody help me solve this and generate the data?
I decided to use the gamma distribution to generate your desired sample, since the given parameters are not suitable for the normal distribution: with a standard deviation (453210) much larger than the mean (170000), a normal distribution would put a large share of values below your lower bound of 1.
Code
import numpy as np
import pandas as pd
# desired parameters
n_rows = 100000
lower, upper = 1, 180000000
mu, sigma = 170000, 453210
# amount of shift
delta = lower
# parameters for the gamma distribution
shape = ((mu - delta) / sigma) ** 2
scale = sigma**2 / (mu - delta)
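# (method-of-moments check: a gamma distribution with shape k and scale theta
#  has mean k*theta and variance k*theta**2; solving k*theta = mu - delta and
#  k*theta**2 = sigma**2 gives exactly the shape and scale used above)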
# Create a dataframe
categories = {'name': [f'Stratum_{i}' for i in range(1, 19 + 1)]}
df = pd.DataFrame(categories).sample(n=n_rows, replace=True).reset_index(drop=True)
# Generate samples along with your desired parameters
generator = np.random.default_rng()
while True:
    df['value'] = generator.gamma(shape=shape, scale=scale, size=n_rows) + delta
    if df.value.max() <= upper:
        break
# Show statistics
print(df.describe())
Output
              value
count       100,000
mean        169,403  (Target: 170,000)
std         449,668  (Target: 453,210)
min               1
25%         39.4267
50%         5529.28
75%         105,748
max     9.45114e+06
Try:
import numpy as np
import pandas as pd

categorical = {'name': ['Stratum_1','Stratum_2','Stratum_3','Stratum_4','Stratum_5','Stratum_6','Stratum_7','Stratum_8','Stratum_9',
                        'Stratum_10','Stratum_11','Stratum_12','Stratum_13','Stratum_14','Stratum_15','Stratum_16','Stratum_17','Stratum_18','Stratum_19']}
desired_mean = 170000
desired_std_dev = 453210
df = pd.DataFrame({'num':np.random.normal(170000, 453210,size=(300000, 1)).reshape(-1), 'cat':np.random.choice(categorical['name'], 300000)})
df[(0<df['num'])&(df['num']<180000000)].sample(100000)
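Note: 300000 rows are drawn so that enough remain after filtering to the (0, 180000000) range to sample 100000 from; the filtering truncates the normal distribution, so the realized mean and standard deviation will drift somewhat from the targets.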
I am using the numpy library to calculate freq = np.fft.rfftfreq(len_data, 1.0 / rate). If I am not wrong, this frequency is without a unit. How can I convert it into hertz? I am using the following code:
import numpy as np
import scipy.io.wavfile
from scipy import signal
def read_wav_file(file_name):
    sample_rate, Data_audio = scipy.io.wavfile.read(file_name)
    return sample_rate, Data_audio

def getFFT(Data_audio, sample_rate):
    len_data = len(Data_audio)
    Data_audio = Data_audio * np.hamming(len_data)
    fft = np.fft.rfft(Data_audio)
    fft = np.abs(fft)
    ret_len_FFT = len(fft)
    freq = np.fft.rfftfreq(len_data, 1.0 / sample_rate)
    return (freq[:int(len(freq))], fft[:int(ret_len_FFT)], ret_len_FFT)
sample_rate_rec, Data_audio_rec = read_wav_file('2020rec.wav')
frequency_rec, fft_rec, ret_lenFFT_rec = getFFT(Data_audio_rec, sample_rate_rec)
print("frequency_rec: " + str(frequency_rec) )
**Output**
frequency_rec: [0.00000000e+00 8.33481508e-03 1.66696302e-02 ... 2.39999833e+04
2.39999917e+04 2.40000000e+04]
# To print the frequencies in fixed-point rather than scientific notation
np.set_printoptions(formatter={'float':'{:f}'.format})
print("frequency_rec: " + str(frequency_rec) )
**Output**
frequency_rec: [0.000000 0.008307 0.016614 ... 23999.983386 23999.991693 24000.000000]
The numpy FFT package has a built-in function to calculate the frequency vector to go along with your FFT output. Note that scipy outputs the sample rate while numpy wants the sample spacing, so you must invert it first.
import numpy as np
def getFrequencies(Data_audio, sample_rate):
    # fftfreq expects the sample spacing d, i.e. the inverse of the sample rate
    return np.fft.fftfreq(n=len(Data_audio), d=1/sample_rate)
Since the output of scipy.io.wavfile.read claims to be in samples/sec, the output of np.fft.fftfreq will be in cycles/sec aka Hertz.
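Since your getFFT uses np.fft.rfft, the matching helper is np.fft.rfftfreq, which you already call; with d=1.0/sample_rate it likewise returns frequencies directly in hertz. A minimal sketch (the sample rate and signal here are placeholders):
import numpy as np

sample_rate = 48000              # assumed sample rate in samples/sec
signal = np.random.randn(1024)   # placeholder signal
freq_hz = np.fft.rfftfreq(len(signal), d=1.0/sample_rate)
print(freq_hz[0], freq_hz[-1])   # 0.0 ... sample_rate/2 (Nyquist), in Hz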
I have found the following method to downsample a signal in Python. I would like to use this method with a sample_rate of 100.21, but I think it currently only works for sample rates that are integer multiples of 8. Is there a possibility to downsample my signal from 100.21 Hz to 8 Hz?
import pandas as pd

def interpolateDataTo8Hz(data, sample_rate, startTime):
    # Downsample by keeping every (sample_rate/8)-th row
    idx_range = range(0, len(data))
    data = data.iloc[idx_range[0::int(sample_rate) // 8]]
    # Set the index to be 8Hz (one sample every 125 ms)
    data.index = pd.date_range(start=startTime, periods=len(data), freq='125L')
    # Interpolate all empty values
    data = interpolateEmptyValues(data)
    return data

def interpolateEmptyValues(data):
    cols = data.columns.values
    for c in cols:
        data[c] = data[c].interpolate()
    return data
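For a non-integer ratio like 100.21 Hz to 8 Hz, index slicing cannot hit the right instants exactly. One alternative (a sketch, not the original method) is to index each row at its true timestamp and let pandas resample to 8 Hz:
import pandas as pd

def resampleTo8Hz(data, sample_rate, startTime):
    # index each row at its true sample spacing (works for non-integer rates)
    period_ns = int(round(1e9 / sample_rate))
    data.index = pd.date_range(start=startTime, periods=len(data),
                               freq=f'{period_ns}N')
    # average within each 125 ms bin, then fill any empty bins
    return data.resample('125L').mean().interpolate()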
Function coinT() tests whether two time series are stationary using the ADF test and the Hurst exponent. The time series are stored in 1511x6 CSV files, but for testing only a vector of the 5th column is returned by the function stock(). There are 50 files in total. It seems the program is using too much memory, as it makes the PC crash after running for ~30 seconds. It works fine on 15 files, but crashes on larger sets (>50).
Can somebody please help me find out what's using so much memory? I've tried splitting the computations into multiple functions and deleting objects, but it didn't help much.
import numpy as np
import pandas as pd
import statsmodels.tsa.stattools as ts
import csv
import timeit
from numpy import log, polyfit, sqrt, std, subtract
from pandas.stats.api import ols
import os
src = 'C:/Users/PC/Desktop/Magistr/Ibpython/testing/'
filenames = next(os.walk(src))[2] #load all stock file names into array
cointegratedPairs = []
def hurst(ts):
    """Returns the Hurst Exponent of the time series vector ts
    H<0.5 - The time series is mean reverting
    H=0.5 - The time series is a Geometric Brownian Motion
    H>0.5 - The time series is trending"""
    # Create the range of lag values
    lags = range(2, 100)
    # Calculate the array of the variances of the lagged differences
    tau = [sqrt(std(subtract(ts[lag:], ts[:-lag]))) for lag in lags]
    # Use a linear fit to estimate the Hurst Exponent
    poly = polyfit(log(lags), log(tau), 1)
    del lags
    del tau
    # Return the Hurst exponent from the polyfit output
    return poly[0]*2.0
#Convert file into an array
def stock(filename):
    #read file into array and get its length
    delimiter = ","
    with open(src + filename, 'r') as dest_f:
        data_iter = csv.reader(dest_f,
                               delimiter=delimiter,
                               quotechar='"')
        data = [row for row in data_iter]
    data_array = np.asarray(data)[:, 5]
    return data_array
    # note: the two del statements that originally followed this return
    # were unreachable and have been dropped
#Check if two time series are cointegrated
def coinTest(itemX, itemY):
    indVar = list(map(float, stock(itemX)[0:1000]))  #2009.05.22 - 2013.05.14
    depVar = list(map(float, stock(itemY)[0:1000]))  #2009.05.22 - 2013.05.14
    #Calculate optimal hedge ratio "beta"
    df = pd.DataFrame()
    df[itemX] = indVar
    df[itemY] = depVar
    res = ols(y=df[itemY], x=df[itemX])
    beta_hr = res.beta.x
    alpha = res.beta.intercept
    df["res"] = df[itemY] - beta_hr*df[itemX] - alpha
    #Calculate the CADF test on the residuals
    cadf = ts.adfuller(df["res"])
    #Reject the null hypothesis at 1% confidence level
    if cadf[4]['1%'] > cadf[0]:
        #Hurst exponent test if residuals are mean reverting
        if hurst(df["res"]) < 0.4:
            cointegratedPairs.append((itemY, itemX))
    del indVar
    del depVar
    del df[itemX]
    del df[itemY]
    del df["res"]
    del cadf
#Main function
def coinT():
    limit = 0
    TotalPairs = 0
    for itemX in filenames:
        for itemY in filenames[limit:]:
            TotalPairs += 1
            if itemX == itemY:
                continue  # bare 'next' was a no-op here; 'continue' is what was meant
            else:
                coinTest(itemX, itemY)
        limit += 1
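One place worth checking for memory is stock(): it materializes the whole CSV as a list of Python lists and then as a string array before slicing out one column. A sketch of a leaner reader that loads only the needed column, assuming the files are plain comma-separated text with no header row or quoted fields:
import numpy as np

def stock_lean(filename):
    # read only column 5 as floats instead of the whole 1511x6 file as strings
    return np.loadtxt(src + filename, delimiter=',', usecols=5, dtype=float)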