Problem putting the content of an h5 file into a DataFrame - Python

I have an h5 file that contains a nested data structure as follows:
Measures/Gait/Joint/Back/Abduction/Maximum
Measures/Gait/Joint/Back/Abduction/Minimum
I would like to put the content of the nested "Minimum" and "Maximum" data structures into a DataFrame (2 columns x 25 rows).
I tried to run this code, but it doesn't work:
import pandas as pd
import h5py
import hdfdict
# Open the HDF5 file read-only and print every name yielded by iterating it.
f = h5py.File("walking5.h5",'r')
for name in f:
print(name)
# Load the same file through hdfdict and print the keys of the result.
res = hdfdict.load("walking5.h5")
print(res.keys())
# NOTE(review): ['Measures']['Gait'] subscripts the one-element list literal
# ['Measures'] with the string 'Gait'. A list cannot be indexed by a str, so
# this is the line that produces
# "TypeError: list indices must be integers or slices, not str".
list3 = [['Measures']['Gait']['Joint']['Back']['Abduction']['Maximum'],['Measures']['Gait']['Joint']['Back']['Abduction']['Minimum']]
# Collects one DataFrame per entry of list3.
df1 = []
# NOTE(review): the loop variable is called np, the conventional alias for
# numpy; harmless here because numpy is not imported in this snippet.
for np in list3:
# NOTE(review): str(...) turns the looked-up value into a single string before
# it reaches pd.DataFrame, which is probably not what is wanted; confirm.
df = pd.DataFrame (str(res[np]))
df.columns = [str(np)]
df1.append(df)
I don't know why this code doesn't work.
I have received this error:
TypeError: list indices must be integers or slices, not str

Related

TypeError: cannot concatenate object of type '<class 'dict'>' when trying to concatenate Excel files

My question might be outdated, but I'm sure it's not a duplicate, and sorry if it is!
I'm trying to concatenate some Excel files (stored in a folder) by using pandas.concat but I keep getting errors, like the one shown below :
CODE :
import pandas as pd
import os
# Read every entry of `folder` with pd.read_excel, tag each result with the
# file name (text before the first dot) and concatenate everything row-wise.
def concat_excel(folder, ws=None):
data = []
for f in os.listdir(folder):
# NOTE(review): ws defaults to None and is forwarded as sheet_name. According
# to the pandas documentation, sheet_name=None makes read_excel return a dict
# of DataFrames (one per sheet), which matches the <class 'dict'> named in the
# reported TypeError.
current_df = pd.read_excel(os.path.join(folder, f), sheet_name=ws, dtype=str)
current_df['Filename'] = f.split('.')[0]
data.append(current_df)
df = pd.concat(data, axis=0)
return df
concat_excel(r'test\myfolder')
ERROR :
------> df = pd.concat(data)
TypeError: cannot concatenate object of type '<class 'dict'>'; only
Series and DataFrame objs are valid
Do you know how to fix this, please ?
Feel free to propose any better way for doing this..
Any help will be appreciated !
I found a workaround !
I'm adding it here, maybe it will be helpful for people who may face the same issue :
import pandas as pd
import os
def concat_excel(folder, ws=None):
    """Concatenate the Excel files found in *folder* into one DataFrame.

    Every entry returned by ``os.listdir(folder)`` is read with
    ``pd.read_excel`` (all cells as ``str``) and tagged with a ``Filename``
    column holding the file name without its extension.

    Args:
        folder: Directory containing the Excel files.
        ws: Forwarded to ``pd.read_excel`` as ``sheet_name``. With the
            default ``None`` (or a list of sheets) pandas returns a dict of
            DataFrames, which are stacked with the sheet key as the outer
            index level. With a single sheet name or position pandas returns
            one DataFrame, which is used as is.

    Returns:
        The row-wise concatenation of all files.

    Raises:
        ValueError: If *folder* has no entries (raised by ``pd.concat``).
    """
    data = []
    for f in os.listdir(folder):
        sheets = pd.read_excel(os.path.join(folder, f), sheet_name=ws, dtype=str)
        # pd.concat accepts a dict of frames but rejects a bare DataFrame, so
        # only the multi-sheet (dict) result is concatenated here.
        temp = pd.concat(sheets) if isinstance(sheets, dict) else sheets
        # splitext removes only the extension, so "report.v2.xlsx" becomes
        # "report.v2" instead of being cut at the first dot.
        temp['Filename'] = os.path.splitext(f)[0]
        data.append(temp)
    return pd.concat(data)
concat_excel(r'test\myfolder')

TypeError: integers expected, but getting strings

This code was originally made in a .ipynb file.
I am getting the TypeError: list indices must be integers or slices, not str
can't seem to figure out how to fix this problem.
The result should be that the unix-timestamps in the dataframe get translated to (Year-Month) and the most recent date should be used as a file.
import numpy as np
import pandas as pd
import time
from datetime import datetime
import os
import re
# NOTE(review): this empty DataFrame is replaced three lines below before it
# is ever used.
df = pd.DataFrame()
files = os.listdir('input')
# Keep only the names that end in '.csv' and contain 'export_'.
arr = [i for i in files if i.endswith('.csv') and 'export_' in i]
df = pd.DataFrame({'filename':arr})
res = []
# The code that gives the error.
for i in df.index:
# re.findall returns the runs of digits in the file name as a list of strings.
unix_code = re.findall('\d+', arr[i])
for x in unix_code:
# NOTE(review): x is already an element of unix_code (a str), so unix_code[x]
# subscripts the list with a string, which is the source of the reported
# TypeError. The string built on this line is also never assigned or stored.
"facturatie_vzs_" + datetime.utcfromtimestamp(unix_code[x]).strftime('%Y-%m') + ".csv"
# Appends the row index i, not a generated file name.
res.append(i)
Use a list comprehension with `and` for the logical AND between scalars (`&` is the bitwise AND, used with arrays), test for the substring with the `in` operator, and finally pass the resulting list to the DataFrame constructor:
# List the input folder and keep only the exported CSV files. `and` is the
# logical operator for plain booleans; `in` tests for a substring.
files = os.listdir('input')
#for test
#files=['export_1656662723.csv', 'export_sss1654071237.csv']
arr = [i for i in files if i.endswith('.csv') and 'export_' in i]
df = pd.DataFrame({'filename':arr})
# Raw string so that \d reaches the regex engine unchanged instead of being an
# invalid Python string escape. The first run of digits in each file name is
# interpreted as a Unix timestamp in seconds.
df['timestamp'] = pd.to_datetime(df['filename'].str.extract(r'(\d+)',expand=False), unit='s')
print (df)
filename timestamp
0 export_1656662723.csv 2022-07-01 08:05:23
1 export_sss1654071237.csv 2022-06-01 08:13:57

Python Pandas .iloc Download columns by number

I want to select columns by their numbers, e.g. 1,3,2. The param.txt file contains only that entry.
import pandas as pd
import numpy as np
df = pd.read_csv('sample1.csv')
# Read the whole parameter file into one string.
with open('param.txt') as f:
s = f.read()
# NOTE(review): s is a single str, so [s] is a one-element list holding that
# string (the error message shows ['1,3,2']); .iloc needs integer positions.
b = df.iloc[:, [s]]
print(b.to_string(index=False))
When I start a script
raise IndexError(f".iloc requires numeric indexers, got {arr}")
IndexError: .iloc requires numeric indexers, got ['1,3,2']
How to simply change from such a form to numeric
Thank you for every help
This should work assuming f.read() returns "1,2,3"
import pandas as pd
import numpy as np
# Load the table whose columns are going to be picked by position.
df = pd.read_csv('sample1.csv')
with open('param.txt') as f:
    # The file is assumed to hold comma-separated positions such as "1,2,3";
    # split on the commas and convert every piece to int in one step.
    s = [int(x) for x in f.read().split(",")]
# s is already a list of ints, so it goes to iloc without extra brackets.
b = df.iloc[:, s]
print(b.to_string(index=False))

Reading multiple excel file in python

Sample data image. I'm new to Python. I'm trying to read multiple Excel files in a folder into separate DataFrames.
Is the below code correct?
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys, os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
os.chdir(r'/Users/try/Documents/data')
# NOTE(review): the parentheses have no effect, so df is a plain empty list
# and not a DataFrame.
df = ([])
# Prints each file name in the_list and tries to store the sheet read from it
# under that name in the module-level df.
def readdataframe(the_list):
for element in the_list:
print(element)
# NOTE(review): element is a file name (str) and df is a list, so df[element]
# is the assignment that raises "list indices must be integers or slices,
# not str". 'shee1' is passed as the sheet name; possibly a typo, confirm.
df[element] = pd.read_excel(element, 'shee1')
readdataframe(["24032020_D_KWH.xlsx","25032020_D_KWH.xlsx","26032020_D_KWH.xlsx","27032020_D_KWH.xlsx"])
I get below error when I execute
TypeError: list indices must be integers or slices, not str
Replacing df = ([]) with df = pd.DataFrame() should do the trick: df was defined as a plain list, not as a pandas DataFrame.
After a test this is what I came up with:
import pandas as pd
import os
# Switch to the folder with the workbooks, presumably so that the bare file
# names collected below can be opened directly.
os.chdir(r"path to your excel files")
# Names of all files ending in '.xlsx' found while walking the folder tree.
the_list = []
# NOTE(review): os.walk also descends into sub-folders, but only the bare file
# name is stored (root is not joined in), so a match inside a sub-folder would
# not resolve from the directory set by os.chdir; confirm this is intended.
for root, dirs, files in os.walk(r"path to your excel files"):
for file in files:
if file.endswith('.xlsx'):
the_list.append(file)
def readdataframe(the_list):
    """Read one-column Excel files into the columns of a single DataFrame.

    Args:
        the_list: Paths of the Excel files to read.

    Returns:
        A DataFrame with one column per file, named after the file. An empty
        DataFrame is returned when *the_list* is empty.
    """
    df = pd.DataFrame()  # define df as an empty pandas DataFrame
    for element in the_list:
        # The squeeze=True keyword of read_excel was removed in pandas 2.0;
        # DataFrame.squeeze("columns") is its replacement and turns a
        # one-column frame into a Series that can be assigned as a column.
        df[element] = pd.read_excel(element).squeeze("columns")
    return df
print(readdataframe(the_list))
Output:
file1.xlsx file2.xlsx file3.xlsx
0 1 6 11
1 2 7 12
2 3 8 13
3 4 9 14
4 5 10 15
I'm sorry but it is considered a bad practice to upload your files, and I'm not going to download it. Nothing personal, just basic digital hygiene.
Now onto the explanation.
As you may have noticed in this line
df[element] = pd.read_excel(element, squeeze=True)
I've added
squeeze=True
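What this parameter does is convert the DataFrame that was returned into a pandas Series (a one-dimensional array; think of it as a regular Python list). Note that the squeeze keyword was removed from read_excel in pandas 2.0, where pd.read_excel(element).squeeze("columns") does the same thing. I used it because I had only 1 column in each of my files, and the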
df[element] =
syntax sets "element" as a column name in the DataFrame where you save your data. So this approach will only work if the data in each file is one-dimensional (only 1 column). If not, you should probably look into pandas.concat or pandas DataFrame.join, depending on the uniformity of the data shape in the files and your needs.
If you would rather get multiple DataFrames instead, this is what I suggest:
import pandas as pd
import os
# Switch to the folder with the workbooks, presumably so that the bare file
# names collected below can be opened directly.
os.chdir(r"path to your excel files")
# Names of all files ending in '.xlsx' found while walking the folder tree.
the_list = []
# NOTE(review): os.walk also descends into sub-folders, but only the bare file
# name is stored (root is not joined in), so a match inside a sub-folder would
# not resolve from the directory set by os.chdir; confirm this is intended.
for root, dirs, files in os.walk(r"path to your excel files"):
for file in files:
if file.endswith('.xlsx'):
the_list.append(file)
def readdataframe(the_list):
    """Load every Excel file in *the_list* into its own DataFrame.

    Returns:
        A dict that maps each file name to the DataFrame read from it.
    """
    return {workbook: pd.read_excel(workbook) for workbook in the_list}
print(readdataframe(the_list))
This way you are getting a python dictionary(hashtable) that contains your dataframe objects as a value and filename as a key.

Why is the cdc_list getting updated after calling the function read_csv() in total_list?

# Program to combine data from 2 csv file
The cdc_list gets updated after second call of read_csv
# NOTE(review): overall_list is created once at module level, outside
# read_csv, so every call to read_csv works on this same list object.
overall_list = []
# Reads `filename`, skips its first line and converts the comma-separated
# fields of the remaining lines to int.
def read_csv(filename):
# The whole file is read in one go; the file object is never closed explicitly.
file_read = open(filename,"r").read()
file_split = file_read.split("\n")
# Everything after the first line (presumably the header row; confirm).
string_list = file_split[1:len(file_split)]
#final_list = []
for item in string_list:
int_fields = []
string_fields = item.split(",")
string_fields = [int(x) for x in string_fields]
int_fields.append(string_fields)
#final_list.append()
# NOTE(review): rows are appended to the shared module-level list and that
# same list is what gets returned, so cdc_list and total_list both refer to
# one list object; this is why len(cdc_list) grows after the second call.
overall_list.append(int_fields)
return(overall_list)
cdc_list = read_csv("US_births_1994-2003_CDC_NCHS.csv")
print(len(cdc_list)) #3652
total_list = read_csv("US_births_2000-2014_SSA.csv")
print(len(total_list)) #9131
print(len(cdc_list)) #9131
I don't think the code you pasted explains the issue you've had, at least it's not anywhere I can determine. Seems like there's a lot of code you did not include in what you pasted above, that might be responsible.
However, if all you want to do is merge two CSVs (assuming they both have the same columns), you can use pandas' read_csv together with DataFrame.append (removed in pandas 2.0; use pd.concat there) and DataFrame.to_csv to achieve this in 3 lines of code (not including imports):
import pandas as pd
# Read CSV file into a Pandas DataFrame object
df = pd.read_csv("first.csv")
# Read the 2nd CSV file and stack its rows under the first one.
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result.
df = pd.concat([df, pd.read_csv("second.csv")])
# Write merged DataFrame object (with both CSV's data) to file
df.to_csv("merged.csv")

Categories