How to add columns to an existing csv file? - python

I have this code that reads through my csv files ( p01_results, p02_results, ..... ) to remove some unwanted rows based on its number from, and it works. Right now I trying to add two columns participantID and session. For participantID I tried to read the name of the csv file, save the ID number (01,02, ...) and fill the column with it. For session, I tried to fill every 18 rows with 1s, 2s, 3s and 4s.
I tried to use this code into mine, but didn't work:
test4 = ['test4', 4, 7, 10]
with open(data.csv, 'r') as ifile
with open(adjusted.csv, 'w') as ofile:
for line, new in zip(ifile, test4):
new_line = line.rstrip('\n') + ',' + str(new) + '\n'
ofile.write(new_line)
import os
base_directory = 'C:\\Users\\yosal\\Desktop\\results'
for dir_path, dir_name_list, file_name_list in os.walk(base_directory):
for file_name in file_name_list:
# If this is not a CSV file
if not file_name.endswith('results.csv'):
# Skip it
continue
file_path = os.path.join(dir_path, file_name)
with open(file_path, 'r') as ifile:
line_list = ifile.readlines()
with open(file_path, 'w') as ofile:
# only write these rows to the new file
ofile.writelines(line_list[0])
ofile.writelines(line_list[2:20])
ofile.writelines(line_list[21:39])
ofile.writelines(line_list[40:58])
ofile.writelines(line_list[59:77])

Try reading the CSV into a list. Then, loop through each element of the list (each element being a row in the CSV), and add a string with the delimieter plus the desired string. Then, write a new CSV, either named differently or replacing the old one, and just use your list as the input.

I tried adding a column to my csv file using pandas. So you can try out something like this. First you have to install pandas by running "pip install pandas".
import pandas as pd
df = pd.read_csv('data.csv') ## read the csv file
df.set_index('S/N', inplace=True) ## you can set an index with any column
##you have that already exists in your csv
##in my case it is the "S/N" column i used
df["test"] = ["values","you want","add"]
df.to_csv('data.csv')

Took me some time, but I did it.
import os
base_directory = 'C:\\Users\\yosal\\Desktop\\results'
for dir_path, dir_name_list, file_name_list in os.walk(base_directory):
for file_name in file_name_list:
# If this is not a CSV file
if not file_name.endswith('results.csv'):
# Skip it
continue
file_path = os.path.join(dir_path, file_name)
with open(file_path, 'r') as ifile:
line_list = ifile.readlines()
with open(file_path, 'w') as ofile:
ofile.writelines(str(line_list[0]).rstrip()+",participant,session\n")
for x in range(2, 20):
ofile.writelines(str(line_list[x]).rstrip()+","+file_path[len(base_directory)+2:len(base_directory)+4]+",1\n")
for y in range(21, 39):
ofile.writelines(str(line_list[y]).rstrip()+","+file_path[len(base_directory)+2:len(base_directory)+4]+",2\n")
for h in range(40, 58):
ofile.writelines(str(line_list[h]).rstrip()+","+file_path[len(base_directory)+2:len(base_directory)+4]+",3\n")
for z in range(59 ,77):
ofile.writelines(str(line_list[z]).rstrip()+","+file_path[len(base_directory)+2:len(base_directory)+4]+",4\n")

Related

How to filter columns within a .CSV file and then save those filtered columns to a new .CSV file in Python?

I am analyzing a large weather data file, Data.csv. I need to write a program in Python that will filter the Data.csv file and keep the following columns only: STATION, NAME/LOCATION, DATE, AWND, SNOW. Then save the filtered file and name it filteredData.csv.
I am using Python 3.8. I have only been able to somewhat figure out how to filter the columns I need within a print function. How do I filter this file and then save the filtered file?
import csv
filename = 'Data.csv'
f = open(filename, 'rt')
reader = csv.reader(f,delimiter=',')
for column in reader:
print(column[0] + "," + column[1] + "," + column[2] + "," + column[3] + "," + column[4] + "," + column[13])
A small section of the Data.csv file
It can be quickly done using Pandas
import pandas as pd
weather_data = pd.read_csv('Data.csv')
filtered_weather = weather_data[['Column_1','Column_1']] #Select the column names that you want
filtered_weather.to_csv('new_file',index=False)
If you're running this under windows you can simply run the code you already wrote with "> newfile.csv" at the end of the command to plug output into a test file.
If you want to do it within the code though:
import csv
new_filename = 'Reduced_Data.csv'
filename = 'Data.csv'
f = open(filename, 'rt')
reader = csv.reader(f,delimiter=',')
for row in reader:
with open(new_filename, 'a') as output:
output.write('"{}","{}","{}","{}","{}","{}"\n'.format(column[0],column[1],column[2],column[3],column[4],column[13]))
check out the CSV reader and this example. you can do something like:
import csv
content = []
with open('Data.csv', 'r') as file:
reader = csv.reader(file, delimiter = ','))
for row in reader:
content.append(row)
print(content)
## now writing them in a file:
with open('filteredData.csv', 'w', newline='') as file:
writer = csv.writer(file)
writer.writerow(['STATION', 'NAME LOCATION', 'DATE', 'AWND', 'SNOW'])
for i in range(1, len(content)):
writer.writerow[content[i][0], content[i][1], content[i][2], content[i][3], content[i][13]) ## i left out some columns, so they will not be in the file later, maybe I did not get that right.
but honestly, I would use this approach but that means only copy & paste.

Pulling out data from CSV files' specific columns in Python

I need a quick help with reading CSV files using Python and storing it in a 'data-type' file to use the data to graph after storing all the data in different files.
I have searched it, but in all cases I found, there was headers in the data. My data does not header part. They are tab separated. And I need to store only specific columns of the data. Ex:
12345601 2345678#abcdef 1 2 365 places
In this case, as an example, I would want to store only "2345678#abcdef" and "365" in the new python file in order to use it in the future to create a graph.
Also, I have more than 1 csv file in a folder and I need to do it in each of them. The sources I found did not talk about it and only referred to:
# open csv file
with open(csv_file, 'rb') as csvfile:
Could anyone refer me to already answered question or help me out with it?
. . . and storing it in a PY file to use the data to graph after storing all the data in different files . . .
. . . I would want to store only "2345678#abcdef" and "365" in the new python file . . .
Are you sure that you want to store the data in a python file? Python files are supposed to hold python code and they should be executable by the python interpreter. It would be a better idea to store your data in a data-type file (say, preprocessed_data.csv).
To get a list of files matching a pattern, you can use python's built-in glob library.
Here's an example of how you could read multiple csv files in a directory and extract the desired columns from each one:
import glob
# indices of columns you want to preserve
desired_columns = [1, 4]
# change this to the directory that holds your data files
csv_directory = '/path/to/csv/files/*.csv'
# iterate over files holding data
extracted_data = []
for file_name in glob.glob(csv_directory):
with open(file_name, 'r') as data_file:
while True:
line = data_file.readline()
# stop at the end of the file
if len(line) == 0:
break
# splits the line by whitespace
tokens = line.split()
# only grab the columns we care about
desired_data = [tokens[i] for i in desired_columns]
extracted_data.append(desired_data)
It would be easy to write the extracted data to a new file. The following example shows how you might save the data to a csv file.
output_string = ''
for row in extracted_data:
output_string += ','.join(row) + '\n'
with open('./preprocessed_data.csv', 'w') as csv_file:
csv_file.write(output_string)
Edit:
If you don't want to combine all the csv files, here's a version that can process one at a time:
def process_file(input_path, output_path, selected_columns):
extracted_data = []
with open(input_path, 'r') as in_file:
while True:
line = in_file.readline()
if len(line) == 0: break
tokens = line.split()
extracted_data.append([tokens[i] for i in selected_columns])
output_string = ''
for row in extracted_data:
output_string += ','.join(row) + '\n'
with open(output_path, 'w') as out_file:
out_file.write(output_string)
# whenever you need to process a file:
process_file(
'/path/to/input.csv',
'/path/to/processed/output.csv',
[1, 4])
# if you want to process every file in a directory:
target_directory = '/path/to/my/files/*.csv'
for file in glob.glob(target_directory):
process_file(file, file + '.out', [1, 4])
Edit 2:
The following example will process every file in a directory and write the results to a similarly-named output file in another directory:
import os
import glob
input_directory = '/path/to/my/files/*.csv'
output_directory = '/path/to/output'
for file in glob.glob(input_directory):
file_name = os.path.basename(file) + '.out'
out_file = os.path.join(output_directory, file_name)
process_file(file, out_file, [1, 4])
If you want to add headers to the output, then process_file could be modified like this:
def process_file(input_path, output_path, selected_columns, column_headers=[]):
extracted_data = []
with open(input_path, 'r') as in_file:
while True:
line = in_file.readline()
if len(line) == 0: break
tokens = line.split()
extracted_data.append([tokens[i] for i in selected_columns])
output_string = ','.join(column_headers) + '\n'
for row in extracted_data:
output_string += ','.join(row) + '\n'
with open(output_path, 'w') as out_file:
out_file.write(output_string)
Here's another approach using a namedtuple that will help extract selected fields from a csv file and then let you write them out to a new csv file.
from collections import namedtuple
import csv
# Setup named tuple to receive csv data
# p1 to p5 are arbitrary field names associated with the csv file
SomeData = namedtuple('SomeData', 'p1, p2, p3, p4, p5, p6')
# Read data from the csv file and create a generator object to hold a reference to the data
# We use a generator object rather than a list to reduce the amount of memory our program will use
# The captured data will only have data from the 2nd & 5th column from the csv file
datagen = ((d.p2, d.p5) for d in map(SomeData._make, csv.reader(open("mydata.csv", "r"))))
# Write the data to a new csv file
with open("newdata.csv","w", newline='') as csvfile:
cvswriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
# Use the generator created earlier to access the filtered data and write it out to a new csv file
for d in datagen:
cvswriter.writerow(d)
Original Data in "mydata.csv":
12345601,2345678#abcdef,1,2,365,places
4567,876#def,0,5,200,noplaces
Output Data in "newdata.csv":
2345678#abcdef,365
876#def,200
EDIT 1:
For tab delimited data make the following changes to the code:
change
datagen = ((d.p2, d.p5) for d in map(SomeData._make, csv.reader(open("mydata.csv", "r"))))
to
datagen = ((d.p2, d.p5) for d in map(SomeData._make, csv.reader(open("mydata2.csv", "r"), delimiter='\t', quotechar='"')))
and
cvswriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
to
cvswriter = csv.writer(csvfile, delimiter='\t', quotechar='"', quoting=csv.QUOTE_MINIMAL)

Deleting "string" containing last rows from CSV file using regex

I am new to Python. I have thousands of CSV files, in which, there is a group of text that comes after the numeric data are logged and I would like to remove all the rows onwards that begin with text. For example:
col 1 col 2 col 3
--------------------
10 20 30
--------------------
45 34 56
--------------------
Start 8837sec 9items
--------------------
Total 6342sec 755items
The good thing is that the text for all the csv files begin with "Start" in column1. I would prefer removing all the rows afterwards including the row that says "Start".
Here is what I have so far:
import csv, os, re, sys
fileList = []
pattern = [r"\b(Start).*", r"\b(Total).*"]
for file in files:
fullname = os.path.join(cwd, file)
if not os.path.isdir(fullname) and not os.path.islink(fullname):
fileList.append(fullname)
for file in fileList:
try:
ifile = open(file, "r")
except IOError:
sys.stderr.write("File %s not found! Please check the filename." %(file))
sys.exit()
else:
with ifile:
reader = csv.reader(ifile)
writer = csv.writer(ifile)
rowList = []
for row in reader:
rowList.append((", ".join(row)))
for pattern in word_pattern:
if not (re.match(pattern, rowList)
writer.writerow(elem)
After running this script, it gives me blank csv file. Any idea what to change?
You don't need the CSV reader for this. You could simply find the offset and truncate the file. Open the file in binary mode and use a multi-line regex to find the pattern in the text and use its index.
import os
import re
# multiline, ascii only regex matches Start or Total at start of line
start_tag_finder = re.compile(rb'(?am)\nStart|\nTotal').search
for filename in files: # TODO: I'm not sure where "files" comes from...
# NOTE: no need to join cwd, relative paths do that automatically
if not os.path.isdir(filename) and not os.path.islink(filename):
with open(filename, 'rb+') as f:
# NOTE: you can cap file size if you'd like
if os.stat(filename).st_size > 1000000:
print(filename, "overflowed 10M size limit")
continue
search = start_tag_finder(f.read())
if search:
f.truncate(search.start())
I would try this for everything after you get your fileList together:
for file in fileList:
keepRows = []
open(file, 'r') as oFile:
for row in oFile:
if row[0] != "Start":
keepRows += row
else:
oFile.close()
with open(file, 'wb+') as nFile:
writer = csv.writer(nFile, delimiter=',')
writer.writerow([keepRows])
This opens your original file, gets the lines you wants, closes it and opens it with the w+. This overwrites the file, keeping the name, but clears it out via truncate and then will write each of the rows you wanted to keep on each row of the cleared out file.
Alternatively, you could create a new file for each csv doing:
for file in fileList:
keepRows = []
with open(file, 'r') as oFile, open('new_file.csv', 'a') as nFile:
for row in oFile:
if row[0] != "Start":
keepRows += row
else:
oFile.close()
for row in keepRows:
nFile.write(row)
Opening with a puts the cursor in the next row each time since this is append. The .writerow method before users iterables which is why it is in [] for the object where as each group, or row, in keepRows while in append does not need iterables and will write each item within the grouping to its own column, move to the next row and do the same thing.
EDIT: Updated syntax for binary file mode and .writer().

Use Python to split a CSV file with multiple headers

I have a CSV file that is being constantly appended. It has multiple headers and the only common thing among the headers is that the first column is always "NAME".
How do I split the single CSV file into separate CSV files, one for each header row?
here is a sample file:
"NAME","AGE","SEX","WEIGHT","CITY"
"Bob",20,"M",120,"New York"
"Peter",33,"M",220,"Toronto"
"Mary",43,"F",130,"Miami"
"NAME","COUNTRY","SPORT","NUMBER","SPORT","NUMBER"
"Larry","USA","Football",14,"Baseball",22
"Jenny","UK","Rugby",5,"Field Hockey",11
"Jacques","Canada","Hockey",19,"Volleyball",4
"NAME","DRINK","QTY"
"Jesse","Beer",6
"Wendel","Juice",1
"Angela","Milk",3
If the size of the csv files is not huge -- so all can be in memory at once -- just use read() to read the file into a string and then use a regex on this string:
import re
with open(ur_csv) as f:
data=f.read()
chunks=re.finditer(r'(^"NAME".*?)(?=^"NAME"|\Z)',data,re.S | re.M)
for i, chunk in enumerate(chunks, 1):
with open('/path/{}.csv'.format(i), 'w') as fout:
fout.write(chunk.group(1))
If the size of the file is a concern, you can use mmap to create something that looks like a big string but is not all in memory at the same time.
Then use the mmap string with a regex to separate the csv chunks like so:
import mmap
import re
with open(ur_csv) as f:
mf=mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
chunks=re.finditer(r'(^"NAME".*?)(?=^"NAME"|\Z)',mf,re.S | re.M)
for i, chunk in enumerate(chunks, 1):
with open('/path/{}.csv'.format(i), 'w') as fout:
fout.write(chunk.group(1))
In either case, this will write all the chunks in files named 1.csv, 2.csv etc.
Copy the input to a new output file each time you see a header line. Something like this (not checked for errors):
partNum = 1
outHandle = None
for line in open("yourfile.csv","r").readlines():
if line.startswith('"NAME"'):
if outHandle is not None:
outHandle.close()
outHandle = open("part%d.csv" % (partNum,), "w")
partNum += 1
outHandle.write(line)
outHandle.close()
The above will break if the input does not begin with a header line or if the input is empty.
You can use the python csv package to read your source file and write multile csv files based on the rule that if element 0 in your row == "NAME", spawn off a new file. Something like this...
import csv
outfile_name = "out_%.csv"
out_num = 1
with open('nameslist.csv', 'rb') as csvfile:
csvreader = csv.reader(csvfile, delimiter=',')
csv_buffer = []
for row in csvreader:
if row[0] != "NAME":
csv_buffer.append(row)
else:
with open(outfile_name % out_num, 'wb') as csvout:
for b_row in csv_buffer:
csvout.writerow(b_row)
out_num += 1
csv_buffer = [row]
P.S. I haven't actually tested this but that's the general concept
Given the other answers, the only modification that I would suggest would be to open using csv.DictReader. pseudo code would be like this. Assuming that the first line in the file is the first header
Note that this assumes that there is no blank line or other indicator between the entries so that a 'NAME' header occurs right after data. If there were a blank line between appended files the you could use that as an indicator to use infile.fieldnames() on the next row. If you need to handle the inputs as a list, then the previous answers are better.
ifile = open(filename, 'rb')
infile = cvs.Dictreader(ifile)
infields = infile.fieldnames
filenum = 1
ofile = open('outfile'+str(filenum), 'wb')
outfields = infields # This allows you to change the header field
outfile = csv.DictWriter(ofile, fieldnames=outfields, extrasaction='ignore')
outfile.writerow(dict((fn, fn) for fn in outfields))
for row in infile:
if row['NAME'] != 'NAME':
#process this row here and do whatever is needed
else:
close(ofile)
# build infields again from this row
infields = [row["NAME"], ...] # This assumes you know the names & order
# Dict cannot be pulled as a list and keep the order that you want.
filenum += 1
ofile = open('outfile'+str(filenum), 'wb')
outfields = infields # This allows you to change the header field
outfile = csv.DictWriter(ofile, fieldnames=outfields, extrasaction='ignore')
outfile.writerow(dict((fn, fn) for fn in outfields))
# This is the end of the loop. All data has been read and processed
close(ofile)
close(ifile)
If the exact order of the new header does not matter except for the name in the first entry, then you can transfer the new list as follows:
infileds = [row['NAME']
for k in row.keys():
if k != 'NAME':
infields.append(row[k])
This will create the new header with NAME in entry 0 but the others will not be in any particular order.

Building list of lists from CSV file

I have an Excel file(that I am exporting as a csv) that I want to parse, but I am having trouble with finding the best way to do it. The csv is a list of computers in my network, and what accounts are in the local administrator group for each one. I have done something similar with tuples, but the number of accounts for each computer range from 1 to 30. I want to build a list of lists, then go through each list to find the accounts that should be there(Administrator, etc.) and delete them, so that I can then export a list of only accounts that shouldn't be a local admin, but are. The csv file is formatted as follows:
"computer1" Administrator localadmin useraccount
"computer2" localadmin Administrator
"computer3" localadmin Administrator user2account
Any help would be appreciated
EDIT: Here is the code I am working with
import csv
import sys #used for passing in the argument
file_name = sys.argv[1] #filename is argument 1
with open(file_name, 'rU') as f: #opens PW file
reader = csv.reader(f)
data = list(list(rec) for rec in csv.reader(f, delimiter=',')) #reads csv into a list of lists
f.close() #close the csv
for i in range(len(data)):
print data[i][0] #this alone will print all the computer names
for j in range(len(data[i])) #Trying to run another for loop to print the usernames
print data[i][j]
The issue is with the second for loop. I want to be able to read across each line and for now, just print them.
This should get you on the right track:
import csv
import sys #used for passing in the argument
file_name = sys.argv[1] #filename is argument 1
with open(file_name, 'rU') as f: #opens PW file
reader = csv.reader(f)
data = list(list(rec) for rec in csv.reader(f, delimiter=',')) #reads csv into a list of lists
for row in data:
print row[0] #this alone will print all the computer names
for username in row: #Trying to run another for loop to print the usernames
print username
Last two lines will print all of the row (including the "computer"). Do
for x in range(1, len(row)):
print row[x]
... to avoid printing the computer twice.
Note that f.close() is not required when using the "with" construct because the resource will automatically be closed when the "with" block is exited.
Personally, I would just do:
import csv
import sys #used for passing in the argument
file_name = sys.argv[1] #filename is argument 1
with open(file_name, 'rU') as f: #opens PW file
reader = csv.reader(f)
# Print every value of every row.
for row in reader:
for value in row:
print value
That's a reasonable way to iterate through the data and should give you a firm basis to add whatever further logic is required.
This is how I opened a .csv file and imported columns of data as numpy arrays - naturally, you don't need numpy arrays, but...
data = {}
app = QApplication( sys.argv )
fname = unicode ( QFileDialog.getOpenFileName() )
app.quit()
filename = fname.strip('.csv') + ' for release.csv'
#open the file and skip the first two rows of data
imported_array = np.loadtxt(fname, delimiter=',', skiprows = 2)
data = {'time_s':imported_array[:,0]}
data['Speed_RPM'] = imported_array[:,1]
It can be done using the pandas library.
import pandas as pd
df = pd.read_csv(filename)
list_of_lists = df.values.tolist()
This approach applies to other kinds of data like .tsv, etc.

Categories