Get rows from CSV by matching header to multiple dictionary key-values - python

I have a CSV file with header and I want to retrieve all the rows from CSV that matches a dictionary key-value. Note that dictionary can contain any number of orbitary key and value to match with.
Here is the code I have written to solve this, is there any other better way to approach this (other than pandas dataframe)?
Better way mean - removal of unnecessary variable if any? better data structure, better library, reducing space/time complexity than below solution
options = {'h1': 'v1', 'h2': 'v2'}
output = []
with open("data.csv", "rt") as csvfile:
data = csv.reader(csvfile, delimiter=',', quotechar='"')
header = next(data)
for row in data:
match = 0
for k, v in options.items():
match += 1 if row[header.index(k)] == v else 0
if len(options.keys()) == match:
output.append(dict(zip(header, row)))
return output

You don't say what you would consider a "better" approach to be. That said, it would take fewer lines of code if you used a csv.DictReader to process the input file as illustrated.
import csv
def find_matching_rows(filename, criteria, delimiter=',', quotechar='"'):
criteria_values = tuple(criteria.values())
matches = []
with open(filename, 'r', newline='') as csvfile:
for row in csv.DictReader(csvfile, delimiter=delimiter, quotechar=quotechar):
if tuple(row[key] for key in criteria) == criteria_values:
matches.append(row)
return matches
results = find_matching_rows('matchtest.csv', {'h1': 'v1', 'h2': 'v2'})
for row in results:
print(row)

You can use a list comprehension to read and filter the rows of a DictReader. Make the wanted options a set and then its an easy test for intersection.
import csv
def test():
options = {'h1': 'v1', 'h2': 'v2'}
wanted = set(options.items())
with open("data.csv", "rt", newline="") as csvfile:
return [row for row in csv.DictReader(csvfile) if set(row.items()) & wanted]
print(test())
print(len(test()))

Related

Create multiple files from unique values of a column using inbuilt libraries of python

I started learning python and was wondering if there was a way to create multiple files from unique values of a column. I know there are 100's of ways of getting it done through pandas. But I am looking to have it done through inbuilt libraries. I couldn't find a single example where its done through inbuilt libraries.
Here is the sample csv file data:
uniquevalue|count
a|123
b|345
c|567
d|789
a|123
b|345
c|567
Sample output file:
a.csv
uniquevalue|count
a|123
a|123
b.csv
b|345
b|345
I am struggling with looping on unique values in a column and then print them out. Can someone explain with logic how to do it ? That will be much appreciated. Thanks.
import csv
from collections import defaultdict
header = []
data = defaultdict(list)
DELIMITER = "|"
with open("inputfile.csv", newline="") as csvfile:
reader = csv.reader(csvfile, delimiter=DELIMITER)
for i, row in enumerate(reader):
if i == 0:
header = row
else:
key = row[0]
data[key].append(row)
for key, value in data.items():
filename = f"{key}.csv"
with open(filename, "w", newline="") as f:
writer = csv.writer(f, delimiter=DELIMITER)
rows = [header] + value
writer.writerows(rows)
import csv
with open('sample.csv', newline='') as csvfile:
reader = csv.reader(csvfile)
for row in reader:
with open(f"{row[0]}.csv", 'a') as inner:
writer = csv.writer(
inner, delimiter='|',
fieldnames=('uniquevalue', 'count')
)
writer.writerow(row)
the task can also be done without using csv module. the lines of the file are read, and with read_file.read().splitlines()[1:] the newline characters are stripped off, also skipping the header line of the csv file. with a set a unique collection of inputdata is created, that is used to count number of duplicates and to create the output files.
with open("unique_sample.csv", "r") as read_file:
items = read_file.read().splitlines()[1:]
for line in set(items):
with open(line[:line.index('|')] + '.csv', 'w') as output:
output.write((line + '\n') * items.count(line))

csv skipping appending data skips rows

I have python code for appending data to the same csv, but when I append the data, it skips rows, and starts from row 15, instead from row 4
import csv
with open('csvtask.csv', 'r') as csv_file:
csv_reader = csv.DictReader(csv_file)
ls = []
for line in csv_reader:
if len(line['Values'])!= 0:
ls.append(int(line['Values']))
new_ls = ['','','']
for i in range(len(ls)-1):
new_ls.append(ls[i+1]-ls[i])
print(new_ls)
with open('csvtask.csv','a',newline='') as new_file:
csv_writer = csv.writer(new_file)
for i in new_ls:
csv_writer.writerow(('','','','',i))
new_file.close()
Here is the image
It's not really feasible to update a file at the same time you're reading it, so a common workaround it to create a new file. The following does that while preserving the fieldnames in the origin file. The new column will be named Diff.
Since there's no previous value to use to calculate a difference for the first row, the rows of the files are processed using the built-in enumerate() function which provides a value each time it's called which provides the index of the item in the sequence as well as the item itself as the object is iterated. You can use the index to know whether the current row is the first one or not and handle in a special way.
import csv
# Read csv file and calculate values of new column.
with open('csvtask.csv', 'r', newline='') as file:
reader = csv.DictReader(file)
fieldnames = reader.fieldnames # Save for later.
diffs = []
prev_value = 0
for i, row in enumerate(reader):
row['Values'] = int(row['Values']) if row['Values'] else 0
diff = row['Values'] - prev_value if i > 0 else ''
prev_value = row['Values']
diffs.append(diff)
# Read file again and write an updated file with the column added to it.
fieldnames.append('Diff') # Name of new field.
with open('csvtask.csv', 'r', newline='') as inp:
reader = csv.DictReader(inp)
with open('csvtask_updated.csv', 'w', newline='') as outp:
writer = csv.DictWriter(outp, fieldnames)
writer.writeheader()
for i, row in enumerate(reader):
row.update({'Diff': diffs[i]}) # Add new column.
writer.writerow(row)
print('Done')
You can use the DictWriter function like this:-
header = ["data", "values"]
writer = csv.DictWriter(file, fieldnames = header)
data = [[1, 2], [4, 6]]
writer.writerows(data)

Python read CSV file, and write to another skipping columns

I have CSV input file with 18 columns
I need to create new CSV file with all columns from input except column 4 and 5
My function now looks like
def modify_csv_report(input_csv, output_csv):
begin = 0
end = 3
with open(input_csv, "r") as file_in:
with open(output_csv, "w") as file_out:
writer = csv.writer(file_out)
for row in csv.reader(file_in):
writer.writerow(row[begin:end])
return output_csv
So it reads and writes columns number 0 - 3, but i don't know how skip column 4,5 and continue from there
You can add the other part of the row using slicing, like you did with the first part:
writer.writerow(row[:4] + row[6:])
Note that to include column 3, the stop index of the first slice should be 4. Specifying start index 0 is also usually not necessary.
A more general approach would employ a list comprehension and enumerate:
exclude = (4, 5)
writer.writerow([r for i, r in enumerate(row) if i not in exclude])
If your CSV has meaningful headers an alternative solution to slicing your rows by indices, is to use the DictReader and DictWriter classes.
#!/usr/bin/env python
from csv import DictReader, DictWriter
data = '''A,B,C
1,2,3
4,5,6
6,7,8'''
reader = DictReader(data.split('\n'))
# You'll need your fieldnames first in a list to ensure order
fieldnames = ['A', 'C']
# We'll also use a set for efficient lookup
fieldnames_set = set(fieldnames)
with open('outfile.csv', 'w') as outfile:
writer = DictWriter(outfile, fieldnames)
writer.writeheader()
for row in reader:
# Use a dictionary comprehension to iterate over the key, value pairs
# discarding those pairs whose key is not in the set
filtered_row = dict(
(k, v) for k, v in row.iteritems() if k in fieldnames_set
)
writer.writerow(filtered_row)
This is what you want:
import csv
def remove_csv_columns(input_csv, output_csv, exclude_column_indices):
with open(input_csv) as file_in, open(output_csv, 'w') as file_out:
reader = csv.reader(file_in)
writer = csv.writer(file_out)
writer.writerows(
[col for idx, col in enumerate(row)
if idx not in exclude_column_indices]
for row in reader)
remove_csv_columns('in.csv', 'out.csv', (3, 4))

Read all columns from CSV file?

I am trying to read in a CSV file and then take all values from each column and put into a separate list. I do not want the values by row. Since the CSV reader only allows to loop through the file once, I am using the seek() method to go back to the beginning and read the next column. Besides using a Dict mapping, is there a better way to do this?
infile = open(fpath, "r")
reader = csv.reader(infile)
NOUNS = [col[0] for col in reader]
infile.seek(0) # <-- set the iterator to beginning of the input file
VERBS = [col[1] for col in reader]
infile.seek(0)
ADJECTIVES = [col[2] for col in reader]
infile.seek(0)
SENTENCES = [col[3] for col in reader]
Something like this would do it in one pass:
kinds = NOUNS, VERBS, ADJECTIVES, SENTENCES = [], [], [], []
with open(fpath, "r") as infile:
for cols in csv.reader(infile):
for i, kind in enumerate(kinds):
kind.append(cols[i])
You could feed the reader to zip and unpack it to variables as you wish.
import csv
with open('input.csv') as f:
first, second, third, fourth = zip(*csv.reader(f))
print('first: {}, second: {}, third: {}, fourth: {}'.format(
first, second, third, fourth
))
With following input:
1,2,3,4
A,B,C,D
It will produce output:
first: ('1', 'A'), second: ('2', 'B'), third: ('3', 'C'), fourth: ('4', 'D')
This works assuming you know exactly how many columns are in the csv (and there isn't a header row).
NOUNS = []
VERBS = []
ADJECTIVES = []
SENTENCES = []
with open(fpath, "r") as infile:
reader = csv.reader(infile)
for row in reader:
NOUNS.append(row[0])
VERBS.append(row[1])
ADJECTIVES.append(row[2])
SENTENCES.append(row[3])
If you don't know the column headers, you're going to have to be clever and read off the first row, make lists for every column you encounter, and loop through every new row and insert in the appropriate list. You'll probably need to do a list of lists.
If you don't mind adding a dependency, use Pandas. Use a DataFrame and the method read_csv(). Access each column using the column name i.e.
df = pandas.DataFrame.read_csv(fpath)
print df['NOUN']
print df['VERBS']
I am not sure why you dont want to use dict mapping. This is what I end up doing
Data
col1,col2,col3
val1,val2,val3
val4,val5,val6
Code
import csv
d = dict()
with open("abc.text") as csv_file:
reader = csv.DictReader(csv_file)
for row in reader:
for key, value in row.items():
if d.get(key) is None:
d[key] = [value]
else:
d[key].append(value)
print d
{'col2': ['val2', 'val5'], 'col3': ['val3', 'val6'], 'col1': ['val1', 'val4']}

How to sort uneven dictionary by key and create CSV

I have a python dictionary which for each KEY one can have a variable number of VALUES (arranged in a list).
For example:
{'607': [36146], '448': [50890, 44513], '626': [44349, 44436]}
What I'd like to do is generate a CSV of this information with a format like so:
448 , 607 , 626
50890,36146,44349
44513, ,44436
Currently my code can produce a CSV such as this, the only issue being that the columns of the CSV are not sorted according to the ascending numerical order of the KEYs. My code so far is below:
csv_file = 'file.csv'
with open(csv_file, 'wb') as fd:
writer = csv.writer(fd, delimiter = ',')
# Format headers for aesthetics
csv_headers = [' {} '.format(elem) for elem in dictionary.keys()]
writer.writerow(headers)
# Format data to create convenient csv format
csv_data = itertools.izip_longest(*dictionary.values(), fillvalue = ' ')
writer.writerows(csv_data)
As you can see I split the KEYs from the VALUEs and write them separately but if I want to sort the columns by the KEYs I imagine this is probably not the best way to go about this. Therefore, I was hoping someone could point me in the right (and most pythonic) direction.
You have two options:
Sort the keys, then extract values in the same order rather than rely on dictionary.values()
Use a csv.DictWriter() object and produce dictionaries per row.
Option 1 looks like this:
csv_file = 'file.csv'
with open(csv_file, 'wb') as fd:
writer = csv.writer(fd, delimiter=',')
keys = sorted(dictionary)
# Format headers for aesthetics
headers = [' {} '.format(key) for key in keys]
writer.writerow(headers)
# Format data to create convenient csv format
csv_data = itertools.izip_longest(*(dictionary[key] for key in keys),
fillvalue=' ')
writer.writerows(csv_data)
Using DictWriter would look like:
csv_file = 'file.csv'
with open(csv_file, 'wb') as fd:
writer = csv.DictWriter(
fd, sorted(dictionary), delimiter=',')
# write formatted headers
writer.writerow({k: ' {} '.format(k) for k in dicitonary})
csv_data = itertools.izip_longest(*dictionary.values(), fillvalue=' ')
writer.writerows(dict(zip(dictionary, row)) for row in csv_data)
I went for sorting and ending up with a transposed tuple of key and an iterable of the lists, then went from there:
import csv
from itertools import izip_longest
d = {'607': [36146], '448': [50890, 44513], '626': [44349, 44436]}
with open('output.csv', 'wb') as fout:
csvout = csv.writer(fout)
header, rows = zip(*sorted((k, iter(v)) for k, v in d.iteritems()))
csvout.writerow(format(el, '^5') for el in header)
csvout.writerows(izip_longest(*rows, fillvalue=' '))

Categories