How would I remove non-duplicates from a CSV file in Python? - python

I have a CSV file with the following format:
User1, User2
1,2
1,3
2,1
I am trying to remove the non-duplicated reverse values, so in this instance I would like to remove 1,3 as it doesn't have a reverse duplicate of 3,1
I currently have this script which removes the duplicate values:
import csv
reader=csv.reader(open('facebook-friendships.csv', 'r'), delimiter=',')
writer=csv.writer(open('facebook-friendships-write.csv', 'w'), delimiter=',')
entries = set()
for row in reader:
key = (row[0], row[1])
key = tuple(key)
if key in entries and key[::-1] in entries:
writer.writerow(row)
entries.add(key)
How would I edit this script to remove the non-duplicated reverse values? So that I am only left with
1,2
2,1
Thanks

Question: Remove non-duplicates from a CSV file?
First aggregate the rows, using the sum(...) of each row's values as the key.
Then loop over the aggregated data and write a group only if more than one row was aggregated under its key.
reader = [('1', '2'), ('1', '3'), ('2', '1')]

# Group rows under an order-independent key so that (a, b) and (b, a) land in
# the same bucket.
# NOTE: the sorted pair itself is used as the key instead of sum(...): a sum
# is not unique (1+4 == 2+3 would wrongly group unrelated pairs) and int()
# would crash on a non-numeric header row.
data_counter = {}
for row in reader:
    key = tuple(sorted(row))
    data_counter.setdefault(key, []).append(row)
print(data_counter)

# A bucket holding a single row has no partner, so it is reported and skipped.
for value in data_counter.values():
    if len(value) == 1:
        print('non duplicate {}'.format(value))
    else:
        # writer.writerows(value)
        print('writer.writerows({})'.format(value))

# Output:
# {('1', '2'): [('1', '2'), ('2', '1')], ('1', '3'): [('1', '3')]}
# writer.writerows([('1', '2'), ('2', '1')])
# non duplicate [('1', '3')]

reader = [
    [1, 2],
    [1, 3],
    [2, 1],
    [4, 5],
    [5, 4],
]

# Bucket every row under its sorted values, so [1, 2] and [2, 1] share the
# key (1, 2).
entries_dict = {}
for row in reader:
    entries_dict.setdefault(tuple(sorted(row)), []).append(row)

# Only buckets with at least two rows contain a pair; emit those rows.
for rows_item in entries_dict.values():
    if len(rows_item) < 2:
        continue
    for row in rows_item:
        print(row)  # writer.writerow(row)
Or you can use groupby:
from itertools import groupby

reader = [
    [1, 2],
    [1, 3],
    [2, 1],
    [4, 5],
    [5, 4],
]


def get_key_from_row(row):
    """Return the row's values as a sorted tuple, so [1, 2] and [2, 1] share a key."""
    ordered = sorted(row)
    return tuple(ordered)


# groupby only merges adjacent items, so the rows are sorted by the same key first.
sorted_rows = sorted(reader, key=get_key_from_row)
grouped_rows = []
for _key, members in groupby(sorted_rows, get_key_from_row):
    grouped_rows.append(list(members))

# Emit only the groups that contain more than one row.
for group in grouped_rows:
    if len(group) > 1:
        for row in group:
            print(row)  # writer.writerow(row)

You could simply loop through each row and check whether a particular row's reverse exists or not. If yes, then it is written in the same sequence as present in the original csv file. Advantage being the original sequence is maintained.
import csv
from collections import Counter


def rows_with_reverse(rows):
    """Return the rows whose reversed form also occurs in *rows*.

    The original order is preserved and each input row is emitted at most
    once (the previous nested loop wrote a row once per matching partner and
    also treated exact duplicates as matches).

    Args:
        rows: iterable of sequences (e.g. the lists produced by csv.reader).

    Returns:
        list: the subset of rows, in input order, that have a reversed partner.
    """
    rows = list(rows)
    counts = Counter(tuple(row) for row in rows)
    kept = []
    for row in rows:
        if not row:
            # Blank lines come back from csv.reader as empty lists; skip them.
            continue
        key = tuple(row)
        mirror = key[::-1]
        # A palindromic row such as ('1', '1') is its own reverse, so it only
        # counts when a second copy of it exists.
        needed = 2 if mirror == key else 1
        if counts[mirror] >= needed:
            kept.append(row)
    return kept


if __name__ == '__main__':
    # newline='' is what the csv module expects for files it reads or writes.
    with open('facebook-friendships.csv', 'r', newline='') as src, \
            open('facebook-friendships-write.csv', 'w', newline='') as dst:
        reader = csv.reader(src, delimiter=',')
        writer = csv.writer(dst, delimiter=',')
        writer.writerows(rows_with_reverse(reader))

Related

How to import data from CSV file that contains just one large row of data, into an array with separated values?

I currently have a CSV file that contains voltage values from an oscilloscope. The data is stored in a singular row made up of 5000 cells, like this:
[0, 0, 0, 2, 2, 2, 4, 2, 2, 2, 0, 0, 0...]
screenshot of part of excel file
When I try to import the data into an array, it creates an array of index 1, containing all 5000 values in that first array[0] index. So when I print array[0], it shows all 5000 values, and when I print array[1-4999], an error occurs. How can I have each value from the cell in its own spot in the array?
Here is my code:
import csv
array = []
with open("test2.csv", 'r') as f:
cols = csv.reader(f, delimiter=",")
for r in cols:
array.append([r])
print(array[0])
import csv
array = []
with open("test2.csv", 'r') as f:
cols = csv.reader(f, delimiter=",")
# r is a single element
for r in cols:
# loop through each item in r
for i in r:
# append each element as list element
array.append([i])
print(array)
Output:
[['0'], ['0'], ['0'], ['2'], ['4'], ['5']]

How to add a list with a list and integers into a csv?

I am trying to put the following into a csv. Here is my code
import csv
data = [[1, 2, 3], 4, 5]
with open('test.csv', 'w') as f:
writer = csv.writer(f)
writer.writerows(data)
I am getting the following error:
_csv.Error: iterable expected, not int
When writing writer.writerow, then the code works but gives [1, 2, 3], 4 and 5 as the columns.
I want the columns to be 1, 2, 3, 4, 5
Any help on how I can do it?
writerow isn't equivalent to writerows
>>> some_data = [[1,2,3],[4,5,6],[7,8,9]]
>>> writer.writerows(some_data)
>
1,2,3
4,5,6
7,8,9
>>> writer.writerow(some_data)
>"[1, 2, 3]","[4, 5, 6]","[7, 8, 9]"
Try:
import csv
headers = [1,2,3,4,5]
some_data = ['Foo','Bar','Baz','Qux','Zoo']
more_data = [['d1','d2','d3'],['d4','d5','d6']]
with open('test.csv', 'w', newline='') as f:
writer = csv.writer(f)
writer.writerow(headers) # Takes an iterable of cells
writer.writerow(some_data)
writer.writerows(more_data) # Takes an iterable of iterables
And you'll get:
1,2,3,4,5
Foo,Bar,Baz,Qux,Zoo
d1,d2,d3
d4,d5,d6
import csv


def flatten_once(items):
    """Flatten one level of nesting: [[1, 2], 3] -> [1, 2, 3].

    Args:
        items: iterable whose elements are either scalars or lists/tuples.

    Returns:
        list: the scalars and the members of each nested list/tuple, in order.
    """
    flat = []
    for item in items:
        if isinstance(item, (list, tuple)):
            flat.extend(item)
        else:
            flat.append(item)
    return flat


data = [[1, 2, 3, 4], 5, 6]
# print_data holds the single flattened row that goes into the csv file.
print_data = flatten_once(data)

# newline='' stops the csv module from emitting blank lines on Windows.
with open('test.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(print_data)

how to export two columns to csv in python

I am not familiar with how to export the list to the csv in python. Here is code for one list:
import csv
X = ([1,2,3],[7,8,9])
Y = ([4,5,6],[3,4,5])
for x in range(0,2,1):
csvfile = "C:/Temp/aaa.csv"
with open(csvfile, "w") as output:
writer = csv.writer(output, lineterminator='\n')
for val in x[0]:
writer.writerow([val])
And I want the following result:
How should I modify the code? (The main problem is how to write the values into a different column.)
To output multiple columns you can use zip() like:
Code:
import csv
x0 = [1, 2, 3]
y0 = [4, 5, 6]
x2 = [7, 8, 9]
y2 = [3, 4, 5]
csvfile = "aaa.csv"
with open(csvfile, "w") as output:
writer = csv.writer(output, lineterminator='\n')
writer.writerow(['x=0', None, None, 'x=2'])
writer.writerow(['x', 'y', None, 'x', 'y'])
for val in zip(x0, y0, [None] * len(x0), x2, y2):
writer.writerow(val)
Results:
x=0,,,x=2
x,y,,x,y
1,4,,7,3
2,5,,8,4
3,6,,9,5
You could try:
import csv

# Copy the first two columns of file.csv into out.csv.
# newline='' is what the csv module expects for files it reads or writes.
with open('file.csv', newline='') as fin, open('out.csv', 'w', newline='') as fout:
    reader = csv.reader(fin)
    writer = csv.writer(fout)
    for r in reader:
        # file.write() accepts a single string; csv.writer takes the row's cells.
        writer.writerow(r[:2])
If you need further help, leave a comment.
When dealing with csv files you should really just use Pandas. Put your header and data into a dataframe, and then use the .to_csv method on that dataframe. Csv can get tricky when you have strings that contain commas, etc...
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_csv.html

Write multiple rows from dict using csv

Update: I do not want to use pandas because I have a list of dict's and want to write each one to disk as they come in (part of webscraping workflow).
I have a dict that I'd like to write to a csv file. I've come up with a solution, but I'd like to know if there's a more pythonic solution available. Here's what I envisioned (but doesn't work):
import csv
test_dict = {"review_id": [1, 2, 3, 4],
"text": [5, 6, 7, 8]}
with open('test.csv', 'w') as csvfile:
fieldnames = ["review_id", "text"]
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
writer.writerows(test_dict)
Which would ideally result in:
review_id text
1 5
2 6
3 7
4 8
The code above doesn't work the way I'd expect it to and throws a ValueError. So, I've turned to the following solution (which does work, but seems verbose).
with open('test.csv', 'w') as csvfile:
fieldnames = ["review_id", "text"]
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
response = test_dict
cells = [{x: {key: val}} for key, vals in response.items()
for x, val in enumerate(vals)]
rows = {}
for d in cells:
for key, val in d.items():
if key in rows:
rows[key].update(d.get(key, None))
else:
rows[key] = d.get(key, None)
for row in [val for _, val in rows.items()]:
writer.writerow(row)
Again, to reiterate what I'm looking for: the block of code directly above works (i.e., produces the desired result mentioned early in the post), but seems verbose. So, is there a more pythonic solution?
Thanks!
Your first example will work with minor edits. DictWriter expects a list of dicts rather than a dict of lists. Assuming you can't change the format of the test_dict:
import csv
test_dict = {"review_id": [1, 2, 3, 4],
"text": [5, 6, 7, 8]}
def convert_dict(mydict, numentries):
    """Convert a dict of columns into a list of row dicts for csv.DictWriter.

    Args:
        mydict: dict mapping a field name to a list of values (one per row).
        numentries: number of rows to build; every list must hold at least
            this many values, otherwise IndexError is raised.

    Returns:
        list: one dict per row, mapping each field name to that row's value.
    """
    # .items() exists on both Python 2 and 3, whereas .iteritems() is
    # Python-2-only and raises AttributeError on Python 3.
    return [
        {key: column[i] for key, column in mydict.items()}
        for i in range(numentries)
    ]
with open('test.csv', 'w') as csvfile:
fieldnames = ["review_id", "text"]
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
writer.writerows(convert_dict(test_dict, 4))
Try using the pandas library for Python.
Here is a simple example
import pandas as pd

test_dict = {"review_id": [1, 2, 3, 4],
             "text": [5, 6, 7, 8]}
d1 = pd.DataFrame(test_dict)
# index=False keeps pandas from writing its row index as an extra first
# column, so the file contains only the review_id and text columns.
d1.to_csv("output.csv", index=False)
Cheers
The built-in zip function can join together different iterables into tuples which can be passed to writerows. Try this as the last line:
writer.writerows(zip(test_dict["review_id"], test_dict["text"]))
You can see what it's doing by making a list:
>>> list(zip(test_dict["review_id"], test_dict["text"]))
[(1, 5), (2, 6), (3, 7), (4, 8)]
Edit: In this particular case, you probably want a regular csv.writer rather than a csv.DictWriter, since what you effectively have now is a list of rows.
If you don't mind using a 3rd-party package, you could do it with pandas.
import pandas as pd
pd.DataFrame(test_dict).to_csv('test.csv', index=False)
update
So, you have several dictionaries and all of them seem to come from a scraping routine.
import pandas as pd
test_dict = {"review_id": [1, 2, 3, 4],
"text": [5, 6, 7, 8]}
pd.DataFrame(test_dict).to_csv('test.csv', index=False)
list_of_dicts = [test_dict, test_dict]
for d in list_of_dicts:
pd.DataFrame(d).to_csv('test.csv', index=False, mode='a', header=False)
This time, you would be appending to the file and without the header.
The output is:
review_id,text
1,5
2,6
3,7
4,8
1,5
2,6
3,7
4,8
1,5
2,6
3,7
4,8
The problem is that with DictWriter.writerows() you are forced to have a dict for each row. Instead you can simply add the values changing your csv creation:
# newline='' lets the csv module control line endings itself; without it
# every row is followed by a blank line on Windows.
with open('test.csv', 'w', newline='') as csvfile:
    fieldnames = test_dict.keys()
    # zip(*columns) transposes the per-field lists into per-row tuples.
    fieldvalues = zip(*test_dict.values())
    writer = csv.writer(csvfile)
    writer.writerow(fieldnames)
    writer.writerows(fieldvalues)
You have two different problems in your question:
Create a csv file from a dictionary where the values are containers and not primitives.
For the first problem, the solution is generally to transform the container type into a primitive type. The most common method is creating a json-string. So for example:
>>> import json
>>> x = [2, 4, 6, 8, 10]
>>> json_string = json.dumps(x)
>>> json_string
'[2, 4, 6, 8, 10]'
So your data conversion might look like:
import json
def convert(datadict):
    '''Lazily serialize every value of a dictionary to a json-string.
    args:
        datadict(dict): dictionary whose values should be serialized
    yield:
        tuple: the original key paired with the json-string of its value
    '''
    for name, container in datadict.items():
        serialized = json.dumps(container)
        yield (name, serialized)
def dump_to_csv_using_dict(datadict, fields=None, filepath=None, delimiter=None):
    '''Dumps a list of dictionaries into a csv file, one row per dictionary
    args:
        datadict(list): list of dictionaries to dump
        fields(list): field sequence to use from the dictionaries [default: sorted union of all keys]
        filepath(str): filepath to save to [default: 'tmp.csv']
        delimiter(str): delimiter to use in csv [default: '|']
    '''
    # Materialize once: the rows are scanned for keys and then written.
    rows = list(datadict)
    if fields is None:
        # datadict is a list, so it has no .keys(); collect them from its rows.
        fieldnames = sorted({key for row in rows for key in row})
    else:
        fieldnames = fields
    filepath = 'tmp.csv' if filepath is None else filepath
    delimiter = '|' if not delimiter else delimiter
    # newline='' stops the csv module from emitting blank lines on Windows.
    with open(filepath, 'w', newline='') as csvfile:
        # restval fills fields a row lacks; extrasaction='ignore' drops keys
        # that are not listed in fieldnames.
        writer = csv.DictWriter(csvfile, fieldnames, restval='', extrasaction='ignore', delimiter=delimiter)
        writer.writeheader()
        writer.writerows(rows)
So the naive conversion looks like this:
# Conversion code
test_data = {
    "review_id": [1, 2, 3, 4],
    "text": [5, 6, 7, 8],
}
# Each list becomes one json-string cell, giving a single csv row.
converted_data = dict(convert(test_data))
data_list = [converted_data]
dump_to_csv_using_dict(data_list)
Create a final value that is actually some sort of a merging of two disparate data sets.
To do this, you need to find a way to combine data from different keys. This is not an easy problem to generically solve.
That said, it's easy to combine two lists with zip.
>>> x = [2, 4, 6]
>>> y = [1, 3, 5]
>>> zip(y, x)
[(1, 2), (3, 4), (5, 6)]
In addition, in the event that your lists are not the same size, Python's itertools module provides a function, izip_longest (named zip_longest in Python 3), which will yield back the full zip even if one list is shorter than another. Note that izip_longest returns an iterator.
from itertools import izip_longest
>>> x = [2, 4]
>>> y = [1, 3, 5]
>>> z = izip_longest(y, x, fillvalue=None) # default fillvalue is None
>>> list(z) # z is a generator
[(1, 2), (3, 4), (5, None)]
So we could add another function here:
try:
    from itertools import zip_longest
except ImportError:  # Python 2 names it izip_longest
    from itertools import izip_longest as zip_longest


def combine(data, fields=None, default=None):
    '''Combines fields within data
    args:
        data(dict): a dictionary with lists as values
        fields(list): a list of keys to combine [default: all fields in the dictionary's own order]
        default: default fill value [default: None]
    yields:
        tuple: columns combined into rows
    '''
    fields = list(data) if fields is None else fields
    # A field missing from data is treated as an empty column, so it is
    # padded with the fill value instead of raising TypeError.
    columns = [data.get(field, ()) for field in fields]
    # zip_longest keeps going until the longest column is exhausted.
    for values in zip_longest(*columns, fillvalue=default):
        yield values
And now we can use this to update our original conversion.
def dump_to_csv(data, filepath=None, delimiter=None):
    '''Dumps rows into csv
    args:
        data(iterable): rows to dump, each row an iterable of cell values
        filepath(str): filepath to save to [default: 'tmp.csv']
        delimiter(str): delimiter to use in csv [default: '|']
    '''
    filepath = 'tmp.csv' if filepath is None else filepath
    delimiter = '|' if not delimiter else delimiter
    # newline='' stops the csv module from emitting blank lines on Windows.
    with open(filepath, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=delimiter)
        # No header row is written: plain rows carry no field names.
        writer.writerows(data)
# Conversion code
test_data = {
    "review_id": [1, 2, 3, 4],
    "text": [5, 6, 7, 8],
}
# combine() already yields one tuple per csv row, so its result is passed
# straight through; wrapping it in a list would write everything as one row.
combined_data = combine(test_data)
dump_to_csv(combined_data)

How to read the first row of an array in Python

I am new to learning Python, here is my current code:
#!/usr/bin/python
l = []
with open('datad.dat', 'r') as f:
for line in f:
line = line.strip()
if len(line) > 0:
l.append(map(float, line.split()))
print l[:,1]
I attempted to do this but made the mistake of using FORTRAN syntax, and received the following error:
File "r1.py", line 9, in <module>
print l[:,1]
TypeError: list indices must be integers, not tuple
How would I go about getting the first row or column of an array?
To print the first row use l[0]; to get columns you will need to transpose with zip: print(list(zip(*l))[0]).
In [14]: l = [[1,2,3],[4,5,6],[7,8,9]]
In [15]: l[0] # first row
Out[15]: [1, 2, 3]
In [16]: l[1] # second row
Out[16]: [4, 5, 6]
In [17]: l[2] # third row
Out[17]: [7, 8, 9]
In [18]: t = list(zip(*l))
In [19]: t[0] # first column
Out[19]: (1, 4, 7)
In [20]: t[1] # second column
Out[20]: (2, 5, 8)
In [21]: t[2] # third column
Out[21]: (3, 6, 9)
The csv module may also be useful:
import csv

# NOTE(review): assumes datad.dat separates its values with spaces, as the
# line.split() in the question suggests; use delimiter='\t' for tab-separated
# data - confirm against the actual file.
with open('datad.dat', 'r', newline='') as f:
    reader = csv.reader(f, delimiter=' ', skipinitialspace=True)
    l = []
    for row in reader:
        # Build real lists (map() is lazy on Python 3 and cannot be indexed)
        # and drop the empty cells produced by trailing spaces.
        values = [float(cell) for cell in row if cell]
        if values:
            l.append(values)

Categories