So I have csv file with over 1m records:(https://i.imgur.com/rhIhy5u.png)
I need the data to be arranged differently, so that the repeating "params" become columns/rows themselves — for example category1, category2, category3 (there are over 20 categories, with no repeats) — while all the data maintains its relations.
I tried using "pandas" and "csv" in Python, but I am completely new to it and I have never had to deal with data like this before.
import csv

import numpy as np

# Collect the unique values of the "Param" column from data.csv,
# preserving first-seen order.
with open('./data.csv', 'r', newline='') as _filehandler:
    # BUG FIX: the original built a csv.reader and then immediately
    # shadowed it with a DictReader on the same handle; only the
    # DictReader is needed.
    csv_file_reader = csv.DictReader(_filehandler)
    param = []
    for row in csv_file_reader:
        if row['Param'] not in param:
            param.append(row['Param'])

# One semicolon-separated line for quick inspection (keeps the
# original trailing '; ').
col = ''.join(str(p) + '; ' for p in param)
print(col)

# BUG FIX: the original passed the undefined name `parameters` to
# savetxt (NameError); the collected list is called `param`.
np.savetxt('./SortedWexdord.csv', param, delimiter=';', fmt='%s')
I tried to think about it, but data is not my forte — any ideas?
Here's something that should work. If you need more than one value per row normalized like this, you could edit line 9 (beginning category) to grab a list of values instead of just row[1].
import csv

# Group values by category: {category: [value, value, ...]}.
data = {}
with open('data.csv', 'r', newline='') as file:
    reader = csv.reader(file)
    next(reader)  # Skip header row
    for row in reader:
        # Assumes category is in column 0 and target value is in column 1.
        category, value = row[0], row[1]
        if category in data:
            data[category].append(value)
        else:
            data[category] = [value]  # New entry only for each unique category

# BUG FIX: 'wb' is a Python 2 idiom; in Python 3 csv.writer needs a
# text-mode file, and newline='' is what avoids doubled newlines on
# Windows.
with open('output.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Category', 'Value'])
    for category in data:
        print([category] + data[category])
        # A list starting with the category and then listing each value.
        writer.writerow([category] + data[category])
Related
I'm trying to 'translate' a code which used the module pandas into a code not using pandas.
the code looks like this:
my code
import pandas as pd

# Compute the average star rating per book title and save it to CSV.
data = pd.read_csv('review.csv')

# Unique book titles (set() removes duplicates; order is not guaranteed).
titles = data['book_title']
temp = []
for name in titles:
    temp.append(name)
temp_set = set(temp)
temp_list = list(temp_set)

# Keep only the two columns we need, indexed by title.
data_simple = data.filter(items=['book_title', 'stars_given'])
data_simple = data_simple.set_index('book_title')

result_table = []
for title in temp_list:
    # BUG FIX: filter(like=title, axis=0) does SUBSTRING matching on the
    # index, so the title "abc" would also match "abcd" and skew the mean.
    # Select rows whose index is exactly equal to the title instead.
    book_data = data_simple.loc[data_simple.index == title]
    average = book_data['stars_given'].mean()
    result_table.append([title, average])

result = pd.DataFrame(data=result_table, columns=['book_title', 'average_rating'])
result.to_csv('average_rating.csv', index=False, encoding='cp949')
(check out the picture; my typing may not be accurate)
Without using the pandas module — using only built-in modules (e.g. starting with 'import csv') — could somebody please help rewrite this code?
Suggest using:
CSV Module
List comprehension for filtering data
Code
import csv

# Load every row of review.csv as a dictionary keyed by column name.
with open('review.csv', 'r') as csv_file:
    data = [row for row in csv.DictReader(csv_file, delimiter=',')]
print(data)

# Names of unique book titles: pull the title column, then deduplicate.
temp = [row['book_title'] for row in data]
temp_set = set(temp)
temp_list = list(temp_set)
print(temp_list)

# Narrow each row (a dictionary) down to book_title and stars_given.
data_simple = [{column: row[column] for column in ['book_title', 'stars_given']}
               for row in data]
print(data_simple)

# Mean of stars per title: gather each title's star counts, then average.
result_table = []
for title in temp_list:
    stars = [int(row['stars_given'])
             for row in data_simple if row['book_title'] == title]
    result_table.append((title, sum(stars) / len(stars)))  # tuple per title
print(result_table)

# Create the resulting CSV.
with open('average_rating.csv', 'w', newline='', encoding='cp949') as csv_file:
    csv_writer = csv.writer(csv_file, delimiter=',')
    csv_writer.writerow(['book_title', 'average_rating'])  # Header
    csv_writer.writerows(result_table)
Test
File: review.csv
book_title,stars_given,comment
abc,5,loved it
def,3,okay to watch
bce,2,too long
abc,4,very funny
File: average_rating.csv
book_title,average_rating
def,3.0
abc,4.5
bce,2.0
I think NumPy could make it?
import numpy as np

# Read the entire CSV into a 2-D array of strings using loadtxt().
arr = np.loadtxt("review.csv", delimiter=",", dtype=str)
I am not sure but try Numpy.
I am trying to search for a file name in a CSV (in column A). If it finds it, then I want to print only the second column (column B), not the whole row.
The CSV is like this:
File Name,ID
1234.bmp,1A
1111.bmp,2B
This is what I have so far, but it prints both the columns:
import os
import csv

# Take the first file name found in the target directory.
f_name = os.listdir(r'C:\Users\Peter\Documents\Python test\Files')[0]

# Read all of test.csv into a list of rows (each row is a list of strings).
data = []
with open ("test.csv") as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        data.append(row)

# First column only (the file names), used for the membership test.
col = [x[0] for x in data]
if f_name in col:
    for x in range(len(data)):
        if f_name ==data[x][0]:
            # NOTE(review): data[x] is the WHOLE row, so both columns are
            # printed; also print() returns None, so None is what gets
            # stored in `action`. This is the bug discussed in the answer
            # below — use data[x][1] and assign before printing.
            action = print(data[x])
else:
    print("File not listed")
You were close. You only had a problem with the indexing (and the print statement).
After this part of the code:
data = []
with open ("test.csv") as csvfile:
reader = csv.reader(csvfile)
for row in reader:
data.append(row)
The data would now be a list of lists:
[
['File Name', 'ID'],
['1234.bmp', '1A'],
['1111.bmp', '2B']
]
In the part where you check the 1st column:
if f_name == data[x][0]:
action = print(data[x])
You printed data[x] which would be one row. You need to index it further to access the 2nd column:
print(data[x]) # ['1234.bmp', '1A']
print(data[x][1]) # 1A
Furthermore, print returns None, so None would be saved into action:
>>> action = print("123")
123
>>> print(action)
None
You need to assign the value to action then print(action):
if f_name == data[x][0]:
action = data[x][1]
print(action) # 1A or 2B
You can also further improve the code by eliminating col. I understand that it's for checking if f_name is in the 1st column ("File Name") of the CSV. Since you are already iterating over each row, you can already check it there if f_name is in row. If it finds it, store the index of that row in a variable (ex. idx_fname_in_csv), so that later, you can access it directly from data. This eliminates the extra variable col and avoids iterating over the data twice.
import os
import csv

# Take the first file name found in the target directory.
f_name = os.listdir(r'C:\Users\Peter\Documents\Python test\Files')[0]

data = []
idx_fname_in_csv = -1  # -1 means "not found" (invalid index sentinel)
with open("test.csv") as csvfile:
    reader = csv.reader(csvfile)
    for idx, row in enumerate(reader):
        data.append(row)
        # BUG FIX: `f_name in row` searched EVERY column, so an ID in
        # column B equal to the file name would false-positive. Only
        # column A (row[0]) holds file names; test it exactly.
        if row and row[0] == f_name:
            idx_fname_in_csv = idx
if idx_fname_in_csv > 0:  # index 0 is the header row, so > 0 is correct here
    action = data[idx_fname_in_csv][1]  # column B (the ID)
    print(action)
else:
    print("File not listed")
Here data would still have the same contents (list of lists) but I used enumerate to keep track of the index.
I am trying to write a set of data to csv file. The file have headers and the header name auto increments against the number of values in output of a that field. For example if I have a Additional Skills column, and there are 17 skills so the header will be like
Additional Skills 1 + Endorsements .... Additional Skills 17 + Endorsements
Now, when I am writing the data against the field, I am able to write it properly if there are exactly 17 fields. But if there is another set of data, which has let's say 10 fields, it does write in 10 fields, now Considering that there are other columns after the "Additional Skills + Endorsements" for example "School" column, instead of writing "school" data in 'school' column the data gets written in "Additional Skills 11 + Endorsements"
My Code for Adding column field is as follows:
# The widest "skillsExtended" list across all profiles decides how many
# "Additional Skills N + Endorsements" header columns are needed.
profile_eksills_len = 0
for profile in link_data:
    profile_eksills_len = max(profile_eksills_len, len(profile["skillsExtended"]))

# Append one numbered header column per skill slot (1-based).
for i in range(profile_eksills_len):
    profile_header.append("Additional Skills {} + Endorsements".format(i + 1))
Code for writing the CSV file is as follows:
# BUG FIX: csv files must be opened with newline='' in Python 3, otherwise
# every row is followed by a blank line on Windows.
with open("data.csv", "w", newline="") as output:
    writer = csv.writer(output, dialect='excel', delimiter='\t', )
    writer.writerow(profile_header)
    # One output row per profile: the fixed fields first, then each
    # variable-length section appended in the same order as the header.
    for profile in link_data:
        exp_data = [profile['name'], profile['info'], profile['currentJob']]
        # Job history: "title:company[:description]".
        for exp in profile["exp"]:
            if exp['jobDesc']:
                exp_data.append(exp['title'] + ":" + exp['comp'] + ":" + exp['jobDesc'])
            else:
                exp_data.append(exp['title'] + ":" + exp['comp'])
        for exp in profile["extras"]:
            exp_data.append(exp['extras'])
        for edu in profile['edu']:
            exp_data.append(edu['school'])
        # Skills: "skill:endorsement_count".
        for skills in profile["skills"]:
            exp_data.append(skills['sets'] + ":" + skills['endorsementCounts'])
        for skill in profile["skillsExtended"]:
            exp_data.append(skill['extsets'] + ":" + skill['endorsedCount'])
        print(exp_data)
        # NOTE(review): when a profile has fewer skillsExtended entries than
        # the header allows, later columns shift left — the DictWriter
        # approach suggested in the answer below avoids this.
        writer.writerow(exp_data)
I would like to know if there is a way to achieve this?
Assuming that you know all your headers in advance, the best approach would be to collect your row data in dictionaries and use csv.DictWriter to write to the file.
DictWriter will handle missing fields automatically. By default it will populate them with an empty string, but you can provide an alternative value via DictWriter's restval parameter.
The outline code would look like this:
# Outline only — the `...` placeholders stand for your real column names
# and values; this is not runnable as-is.
fieldnames = ['heading1', 'heading2', ...]
with open("data.csv", "w") as output:
    writer = csv.DictWriter(output, fieldnames, dialect='excel', delimiter='\t', )
    writer.writeheader()
    # get job title
    for profile in link_data:
        # Build a dictionary of the fieldnames and values for the row.
        # Any fieldname missing from the dict is filled with restval ('').
        row_data = {'heading1': 'foo', 'heading2': 'bar',...}
        writer.writerow(row_data)
# Read the header row once, then zip it with every data row to build a
# dict per row — DictReader behavior without DictReader.
with open(file_name, encoding='utf-8') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    header = next(csv_reader)
    print("header:", header)
    for row in csv_reader:
        data = {col: value for col, value in zip(header, row)}
        print('combined dict data:', data)
Read header and then use zip to get the list of values
I have a csv file from which I need to select certain rows. It is easy for me to remove AGE and MEAN WEIGHT, because these names are the same in any file.
ID,AGE,HEIGHT,MEAN WEIGHT,20-Nov-2002,05-Mar-2003,09-Apr-2003,23-Jul-2003
1,23,1.80,80,78,78,82,82
2,25,1.60,58,56,60,60,56
3,20,1.90,100,98,102,98,102
ID,HEIGHT,20-Nov-2002,05-Mar-2003,09-Apr-2003,23-Jul-2003
1,1.80,78,78,82,82
2,1.60,56,60,60,56
3,1.90,98,102,98,102
i have this code
import csv

# IMPROVEMENT: the original opened and closed both files by hand, which
# leaks the handles if an exception occurs mid-copy; `with` closes them
# automatically.
with open("C:/Users/Pedro/data.csv") as out, \
        open('C:/Users/Pedro/datanew.csv', 'w') as result:
    rdr = csv.reader(out)
    wtr = csv.writer(result, delimiter=',', lineterminator='\n')
    for row in rdr:
        # Keep ID, HEIGHT and the four date columns (hard-coded indexes —
        # see the answer below for detecting date columns dynamically).
        wtr.writerow((row[0], row[2], row[4], row[5], row[6], row[7]))
but my difficulty is select all columns that have dates. The number of columns of the dates may be variable. The solution could be to detect the character - in row[4]
I'm not 100% sure what you're asking, but here is a script that may do what you want, which is to reproduce the file with all of an unknown number of date columns, plus your columns 0 and 2 (ID & HEIGHT):
import csv

with open('data.csv') as infile:  # Use 'with' to close files automatically
    reader = csv.reader(infile)
    # BUG FIX: reader.next() is Python 2 only; Python 3 uses the next()
    # builtin on the iterator.
    headers = next(reader)  # Read first line
    # Figure out which columns have '-' in them (assume these are dates)
    date_columns = [col for col, header in enumerate(headers) if '-' in header]
    # Add our desired other columns (0 = ID, 2 = HEIGHT)
    all_columns = [0, 2] + date_columns
    with open('new.csv', 'w') as outfile:
        writer = csv.writer(outfile, delimiter=',', lineterminator='\n')
        # write headers
        writer.writerow([headers[i] for i in all_columns])
        # write data
        for row in reader:  # Read remaining data from our input CSV
            writer.writerow([row[i] for i in all_columns])
Does that help?
I have the csv file as follows:
product_name, product_id, category_id
book, , 3
shoe, 3, 1
lemon, 2, 4
I would like to update product_id of each row by providing the column name using python's csv library.
So for an example if I pass:
update_data = {"product_id": [1,2,3]}
then the csv file should be:
product_name, product_id, category_id
book, 1, 3
shoe, 2, 1
lemon, 3, 4
You can use your existing dict and iter to take items in order, eg:
import csv

update_data = {"product_id": [1, 2, 3]}

# Convert the values of your dict to be directly iterable so we can `next` them
to_update = {k: iter(v) for k, v in update_data.items()}

# BUG FIX: 'rb'/'wb' are Python 2 idioms; in Python 3 the csv module
# requires text-mode files, opened with newline=''.
with open('input.csv', 'r', newline='') as fin, \
        open('output.csv', 'w', newline='') as fout:
    # create in/out csv readers, skip initial space so it matches the update dict
    # and write the header out
    csvin = csv.DictReader(fin, skipinitialspace=True)
    csvout = csv.DictWriter(fout, csvin.fieldnames)
    csvout.writeheader()
    for row in csvin:
        # Update rows - if we have something left and it's in the update dictionary,
        # use that value, otherwise we use the value that's already in the column.
        row.update({k: next(to_update[k], row[k]) for k in row if k in to_update})
        csvout.writerow(row)
Now - this assumes that each new column value goes to the row number and that the existing values should be used after that. You could change that logic to only use new values when the existing value is blank for instance (or whatever other criteria you wish).
(assuming you're using 3.x)
Python has a CSV module in the standard library which helps read and amend CSV files.
Using that I'd find the index for the column you are after and store it in the dictionary you've made. Once that has been found it's simply a matter of popping the list item into each row.
import csv

# The original value list is nested inside another list so the first
# position can hold the column index once it has been found.
update_data = {"product_id": [None, [1, 2, 3]]}

line_no = 0  # counter: 0 means we are still on the header row
new_csv = []  # holds the updated rows for when we rewrite the file

with open('test.csv', 'r', newline='') as csvfile:
    filereader = csv.reader(csvfile)
    for line in filereader:
        if line_no == 0:
            for key in update_data:
                # Find the column's index from the header and store it.
                update_data[key][0] = line.index(key)
        else:
            for key in update_data:
                # ROBUSTNESS FIX: the original popped unconditionally and
                # raised IndexError when the CSV had more data rows than
                # supplied values; leave extra rows unchanged instead.
                if update_data[key][1]:
                    line[update_data[key][0]] = update_data[key][1].pop(0)
        new_csv.append(line)
        line_no += 1

# BUG FIX: open with newline='' so csv.writer does not emit blank lines
# between rows on Windows.
with open('test.csv', 'w', newline='') as csvfile:
    filewriter = csv.writer(csvfile)
    for line in new_csv:
        filewriter.writerow(line)