Creating columns with continuous values on individual CSV files - Python

I have a large csv file which I have split into six individual files. I am using a for loop to read each file and create a column in which the values ascend by one.
import numpy as np
import pandas as pd

whole_file = ['100Hz1-raw.csv', '100Hz2-raw.csv', '100Hz3-raw.csv',
              '100Hz4-raw.csv', '100Hz5-raw.csv', '100Hz6-raw.csv']
first_file = True
for piece in whole_file:
    if not first_file:
        skip_row = [0]  # if it is not the first csv file then skip the header row (row 0) of that file
    else:
        skip_row = []
    V_raw = pd.read_csv(piece, skiprows=skip_row)
    first_file = False
    V_raw['centiseconds'] = np.arange(len(V_raw))  # label each centisecond
My output: the centiseconds column restarts from 0 in each file.
My desired output: a single count that carries on across all six files.
Is there a clever way of doing what I intend?

Store the last value for centiseconds and count from there:
import numpy as np
import pandas as pd

whole_file = ['100Hz1-raw.csv', '100Hz2-raw.csv', '100Hz3-raw.csv',
              '100Hz4-raw.csv', '100Hz5-raw.csv', '100Hz6-raw.csv']
first_file = True
# create old_centiseconds variable
old_centiseconds = 0
for piece in whole_file:
    if not first_file:
        skip_row = [0]  # if it is not the first csv file then skip the header row (row 0) of that file
    else:
        skip_row = []
    V_raw = pd.read_csv(piece, skiprows=skip_row)
    first_file = False
    # add old_centiseconds onto what you had before
    V_raw['centiseconds'] = np.arange(len(V_raw)) + old_centiseconds  # label each centisecond
    # update old_centiseconds
    old_centiseconds += len(V_raw)
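Alternatively (a minimal sketch, assuming all six pieces fit comfortably in memory and share the same header), you could concatenate the pieces first and number the combined frame in one pass:
import numpy as np
import pandas as pd

whole_file = ['100Hz1-raw.csv', '100Hz2-raw.csv', '100Hz3-raw.csv',
              '100Hz4-raw.csv', '100Hz5-raw.csv', '100Hz6-raw.csv']

# each piece is read with its own header, so no skiprows bookkeeping is needed
combined = pd.concat((pd.read_csv(p) for p in whole_file), ignore_index=True)
combined['centiseconds'] = np.arange(len(combined))  # one continuous count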

As I said in my comment, you may want to hold the data as a NumPy array, as this requires less memory. You can do this by opening each .csv file as a NumPy array and appending it to an empty list. If you would like to stack these arrays together, you can use np.vstack. The following code should do this:
import numpy as np
from numpy import genfromtxt

whole_file = ['100Hz1-raw.csv', '100Hz2-raw.csv', '100Hz3-raw.csv',
              '100Hz4-raw.csv', '100Hz5-raw.csv', '100Hz6-raw.csv']
whole_file_numpy_array = []
for file_name in whole_file:
    my_data = genfromtxt(file_name, delimiter=',')
    whole_file_numpy_array.append(my_data)  # append the parsed array, not the file name
combined_numpy_array = np.vstack(whole_file_numpy_array)
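One caveat (an assumption on my part, since these files appear to carry header rows): genfromtxt parses non-numeric header text as nan, so you may want to skip the header explicitly:
my_data = genfromtxt(file_name, delimiter=',', skip_header=1)  # drop the header row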

Related

How to use Python to separate a one-column CSV file if the columns have no headings, then save this into a new Excel file?

So, I am quite new to Python and have been googling a lot but have not found a good solution. What I am looking to do is automate Excel's Text to Columns on a document without headers.
Here is the sheet I have: it is a CSV file where all the data is in one column, without headers.
ex. hi ho loe time jobs barber
jim joan hello
009 00487 08234 0240 2.0348 20.34829
The delimiter is space and comma.
What I want to come out is saved in another Excel file with the first two rows deleted and the data separated into columns (this can be done using Text to Columns in Excel, but I would like to automate it for several sheets):
009 | 00487 | 08234 | 0240 | 2.0348 | 20.34829
The code I have written so far is like this:
import os
import csv
import pandas as pd

path = 'C:/Users/ionan/OneDrive - Universiteit Utrecht/Desktop/UCU/test_excel'
os.chdir(path)
for root, dirs, files in os.walk(path):
    for f in files:
        df = pd.read_csv(f, delimiter='\t' + ';', engine='python')
The original file is named data.xlsx. This means all the data we need is under the column Data.
Code to split data into multiple columns for a single file:
import pandas as pd
import numpy as np

f = 'data.xlsx'

# -- Insert the following code in your `for f in files` loop --
file_data = pd.read_excel(f)

# Since the number of values to be split is not known, set `num_cols` to the
# number of columns you expect in the modified excel file
num_cols = 20

# Create a dataframe with twenty columns
new_file = pd.DataFrame(columns=["col_{}".format(i) for i in range(num_cols)])

# Change the column name of the first column in new_file to "Data"
new_file = new_file.rename(columns={"col_0": file_data.columns[0]})

# Add the value of the first cell in the original file to the first cell of the
# new excel file
new_file.loc[0, new_file.columns[0]] = file_data.iloc[0, 0]

# Loop through all rows of the original excel file
for index, row in file_data.iterrows():
    # Skip the first row
    if index == 0:
        continue
    # Split the row by `space`. This gives us a list of strings.
    split_data = file_data.loc[index, "Data"].split(" ")
    print(split_data)
    # Convert each element to a float (a number) if we want numbers and not strings
    # split_data = [float(i) for i in split_data]
    # Make sure the size of the list matches the number of columns in `new_file`.
    # np.NaN represents no value.
    split_data = [np.NaN] + split_data + [np.NaN] * (num_cols - len(split_data) - 1)
    # Store the list at a given index using the `.loc` method
    new_file.loc[index] = split_data

# Drop all the columns where there is not a single number
new_file.dropna(axis=1, how='all', inplace=True)

# Get the original excel file name
new_file_name = f.split(".")[0]

# Save the new excel file at the same location where the original file is.
new_file.to_excel(new_file_name + "_modified.xlsx", index=False)
This creates a new Excel file (with a single sheet) named data_modified.xlsx.
Summary (code without comments):
import pandas as pd
import numpy as np

f = 'data.xlsx'
file_data = pd.read_excel(f)
num_cols = 20
new_file = pd.DataFrame(columns=["col_{}".format(i) for i in range(num_cols)])
new_file = new_file.rename(columns={"col_0": file_data.columns[0]})
new_file.loc[0, new_file.columns[0]] = file_data.iloc[0, 0]
for index, row in file_data.iterrows():
    if index == 0:
        continue
    split_data = file_data.loc[index, "Data"].split(" ")
    split_data = [np.NaN] + split_data + [np.NaN] * (num_cols - len(split_data) - 1)
    new_file.loc[index] = split_data
new_file.dropna(axis=1, how='all', inplace=True)
new_file_name = f.split(".")[0]
new_file.to_excel(new_file_name + "_modified.xlsx", index=False)
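To automate this for several sheets, the summary block above can be dropped into the os.walk loop from the question (a sketch; the endswith filter is my assumption about the directory contents):
import os

path = 'C:/Users/ionan/OneDrive - Universiteit Utrecht/Desktop/UCU/test_excel'

for root, dirs, files in os.walk(path):
    for name in files:
        if not name.endswith('.xlsx'):
            continue  # skip anything that is not an Excel file
        f = os.path.join(root, name)
        # ...run the summary block above on `f` (everything after `f = 'data.xlsx'`)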

Extracting all respective rows (separately) from multiple csv files and combining them to save as new files

I have a number of csv files. I need to extract the respective rows from each file and save them as new files; i.e. the first output file must contain the first rows of all input files, and so on.
I have done the following:
import pandas as pd
import os
import numpy as np

data = pd.DataFrame('', columns=['ObjectID', 'SPI'], index=np.arange(1, 100))
path = r'C:\Users\bikra\Desktop\Pandas'
i = 1
for files in os.listdir(path):
    if files[-4:] == '.csv':
        for j in range(0, 10, 1):
            #print(files)
            dataset = pd.read_csv(r'C:\Users\bikra\Desktop\Pandas' + '\\' + files)
            spi1 = dataset.loc[j, 'SPI']
            data.loc[i]['ObjectID'] = files[:]
            data.loc[i]['SPI'] = spi1
            data.to_csv(r'C:\Users\bikra\Desktop\Pandas\output\\' + str(j) + '.csv')
            i + 1
It works well when the index (i.e. 'j') is specified, but when I try to loop, the output csv file contains only the first row. Where am I wrong?
You'd better use append:
data = data.append(spi1)
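A fuller sketch of that idea (my assumption of the intended layout; note that DataFrame.append was removed in pandas 2.0, so this collects the rows and builds the frame with pd.DataFrame instead):
import os
import pandas as pd

path = r'C:\Users\bikra\Desktop\Pandas'
csv_files = [name for name in os.listdir(path) if name.endswith('.csv')]

# read every input file once
frames = {name: pd.read_csv(os.path.join(path, name)) for name in csv_files}

for j in range(10):  # one output file per row index
    # collect row j of every input file, remembering which file it came from
    rows = [{'ObjectID': name, 'SPI': df.loc[j, 'SPI']} for name, df in frames.items()]
    pd.DataFrame(rows).to_csv(os.path.join(path, 'output', str(j) + '.csv'), index=False)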

Use numpy to transpose the list of lists so it can be uploaded into gsheet

I am looking to import some data into a Google Sheet, and am using numpy to transpose the list of lists so it can be uploaded into gsheet. When I run the script I get the following error:
IndexError: index 0 is out of bounds for axis 0 with size 0
import csv

import numpy as np
import pandas as pd

def update_sheet(ws, rows, left=1, top=2):
    """
    Updates the google spreadsheet with the given table.
    - ws is a gspread.models.Worksheet object
    - rows is a table (list of lists)
    - left is the number of the first column in the target document (beginning with 1)
    - top is the number of the first row in the target document (beginning with 1)
    """
    # number of rows and columns
    num_lines, num_columns = len(rows), len(rows[0])
    # selection of the range that will be updated
    cell_list = ws.range(
        colrow_to_A1(left, top) + ':' + colrow_to_A1(left + num_columns - 1, top + num_lines - 1)
    )
    # modifying the values in the range
    for cell in cell_list:
        val = rows[cell.row - top][cell.col - left]
        cell.value = val
    # update in batch
    ws.update_cells(cell_list, value_input_option='USER_ENTERED')

# read csv into pandas to manipulate the layout, edit the column headings and remove euro symbols
# (`file`, `worksheet` and `colrow_to_A1` are defined elsewhere in the script)
df = pd.read_csv(file)
df = df.iloc[2:]  # remove the first 2 rows from the dataframe
# rename the df columns as it reads them in really weird
# df['Amount'] = df['Amount'].str[2:]
df.to_csv(file, index=False)

# read in the CSV as a list of lists
with open(file) as csvfile:
    rows = csv.reader(csvfile)
    res = list(map(list, zip(*rows)))
    # print(*res)

# delete the first column of the transposed list of lists - this means we no longer have
# the headers (couldn't work out how to transpose with map, list, zip, etc.)
for row in res:
    del row[0]

# use numpy to transpose the list of lists so it can be uploaded into gsheet
res2 = np.array(res).T.tolist()
res2 = np.delete(res2, (0), axis=0)  # removes top row from array (title row from csv export)
res2 = res2[:-3, :]  # removes last 3 rows from array (superfluous export details from export)

ws = worksheet
update_sheet(ws, res2)

Pandas - Overwrite single column with new values, retain additional columns; overwrite original files

Fairly new to Python. I have CSVs with 2 columns, and I need the code to perform a simple calculation on the first column while retaining the information in the second. The code currently performs the calculation (albeit only on the first csv in the list, and there are numerous), but I haven't figured out how to overwrite the values in each file while leaving the second column unchanged. I'd like it to save over the original files with the new calculations. Additionally, the originals have no header, and pandas automatically assigns numeric column labels.
import os
import pandas as pd

def find_csv(topdir, suffix='.csv'):
    filenames = os.listdir(topdir)
    csv_list = [name for name in filenames if name.endswith(suffix)]
    fp_list = []
    for csv in csv_list:
        fp = os.path.join(topdir, csv)
        fp_list.append(fp)
    return fp_list

def wn_to_um(wn):
    um = 10000 / wn
    return um

for f in find_csv('C:/desktop/test'):
    readit = pd.read_csv(f, usecols=[0])
    convert = wn_to_um(readit)
    df = pd.DataFrame(convert)
    df.to_csv('C:/desktop/test/whatever.csv')
I suppose you just have to make minor changes to your code.
def wn_to_um(wn):
    wn.iloc[:, 0] = 10000 / wn.iloc[:, 0]  # performing the operation on the first column
    return wn

for f in find_csv('C:/desktop/test'):
    readit = pd.read_csv(f)  # here, read the whole file
    convert = wn_to_um(readit)  # the function now only touches the first column
    os.remove(f)  # if you want to replace the existing file with the updated calculation, simply delete and write
    convert.to_csv(f, index=False)  # write the converted frame back under the original file name
Say you have a column named 'X' which you want to divide by 10,000. You can store this as X and then divide each element in X like so:
X = df['X']
new_x = [i / 10000 for i in X]
From here, rewriting the column in the dataframe is very simple:
df['X'] = new_x
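Since a pandas column is a Series and arithmetic on it is vectorized, the comprehension isn't strictly needed; the whole update can be a single line:
df['X'] = df['X'] / 10000  # element-wise division over the whole column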
Just update your second function as:
def wn_to_um(wn):
    wn.iloc[:, 0] = 10000 / wn.iloc[:, 0]
    return wn

How to split a log file into several csv files with python

I'm pretty new to Python and coding in general, so sorry in advance for any dumb questions. My program needs to split an existing log file into several *.csv files (run1.csv, run2.csv, ...) based on the keyword 'MYLOG'. If the keyword appears, it should start copying the two desired columns into the new file until the keyword appears again. When finished, there need to be as many csv files as there are occurrences of the keyword.
53.2436 EXP MYLOG: START RUN specs/run03_block_order.csv
53.2589 EXP TextStim: autoDraw = None
53.2589 EXP TextStim: autoDraw = None
55.2257 DATA Keypress: t
57.2412 DATA Keypress: t
59.2406 DATA Keypress: t
61.2400 DATA Keypress: t
63.2393 DATA Keypress: t
...
89.2314 EXP MYLOG: START BLOCK scene [specs/run03_block01.csv]
89.2336 EXP Imported specs/run03_block01.csv as conditions
89.2339 EXP Created sequence: sequential, trialTypes=9
...
[EDIT]: The output per file (run*.csv) should look like this:
onset type
53.2436 EXP
53.2589 EXP
53.2589 EXP
55.2257 DATA
57.2412 DATA
59.2406 DATA
61.2400 DATA
...
The program creates as many run*.csv files as needed, but I can't store the desired columns in my new files. When finished, all I get are empty csv files. If I change the counter variable to == 1, it creates just one big file with the desired columns.
Thanks again!
import csv

QUERY = 'MYLOG'
with open('localizer.log', 'rt') as log_input:
    i = 0
    for line in log_input:
        if QUERY in line:
            i = i + 1
            with open('run' + str(i) + '.csv', 'w') as output:
                reader = csv.reader(log_input, delimiter=' ')
                writer = csv.writer(output)
                content_column_A = [0]
                content_column_B = [1]
                for row in reader:
                    content_A = list(row[j] for j in content_column_A)
                    content_B = list(row[k] for k in content_column_B)
                    writer.writerow(content_A)
                    writer.writerow(content_B)
Looking at the code, there are a few things that are possibly wrong:
- the csv reader should take a file handler, not a single line.
- the reader delimiter should not be a single space character, as it looks like the actual delimiter in your logs is a variable number of multiple space characters.
- the looping logic seems to be a bit off, confusing files/lines/rows a bit.
You may be looking at something like the code below (pending clarification in the question):
import csv

NEW_LOG_DELIMITER = 'MYLOG'

def write_buffer(_index, buffer):
    """
    This function takes an index and a buffer.
    The buffer is just an iterable of iterables (e.g. a list of lists).
    Each buffer item is a row of values.
    """
    filename = 'run{}.csv'.format(_index)
    with open(filename, 'w') as output:
        writer = csv.writer(output)
        writer.writerow(['onset', 'type'])  # adding the heading
        writer.writerows(buffer)

current_buffer = []
_index = 1

with open('localizer.log', 'rt') as log_input:
    for line in log_input:
        # will deal ok with multi-space as long as
        # you don't care about the last column
        fields = line.split()[:2]
        if NEW_LOG_DELIMITER not in line or not current_buffer:
            # If it's the first line (the current_buffer is empty)
            # or the line does NOT contain "MYLOG", then
            # collect it until it's time to write it to file.
            current_buffer.append(fields)
        else:
            write_buffer(_index, current_buffer)
            _index += 1
            current_buffer = [fields]  # EDIT: fixed bug, new buffer should not be empty

if current_buffer:
    # We are now out of the loop;
    # if there's an unwritten buffer then write it to file.
    write_buffer(_index, current_buffer)
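Since this streams the log line by line and only ever buffers the rows of a single run at a time, it should stay memory-friendly even for large log files.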
You can use pandas to simplify this problem.
Import pandas and read in the log file:
import pandas as pd
df = pd.read_fwf('localizer2.log', header=None)
df.columns = ['onset', 'type', 'event']
df.set_index('onset', inplace=True)
Set a flag where the third column starts with 'MYLOG':
df['flag'] = 0
df.loc[df.event.str[:5] == 'MYLOG', 'flag'] = 1
df.flag = df['flag'].cumsum()
Save each run as a separate run*.csv file
for i in range(1, df.flag.max() + 1):
    df.loc[df.flag == i, 'event'].to_csv('run{0}.csv'.format(i))
EDIT:
Looks like your format is different from what I originally assumed, so I changed the answer to use pd.read_fwf. My localizer.log file was a copy and paste of your original data; hope this works for you. I assumed from the original post that it did not have headers. If it does have headers, then remove header=None and df.columns = ['onset', 'type', 'event'].
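For the with-headers case, that variant would be just (a sketch, assuming the file's header row supplies the column names):
df = pd.read_fwf('localizer2.log')  # column names come from the header row
df.set_index('onset', inplace=True)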
