Read only defined columns of CSV - python

I wrote a python program that joins 2 csv tables according to a matching key.
My data looks like this:
Table 1:
ID;NAME;ADRESS;TEL
1; Lee; Str.; 12345
2; Chu; Blv.; 34567
Table 2:
AID; FID; XID
50 1 99
676 2 678
My code looks like this:
# Join Table 1 (keyed on ID) with Table 2 (keyed on FID) into merged2.csv.
# Indentation restored — the pasted snippet had been flattened.
data = OrderedDict()
fieldnames = []
with open(join_file, "rt") as fp:
    reader = csv.DictReader(fp, dialect=excel_semicolon)
    fieldsB = reader.fieldnames
    fieldnames.extend(fieldsB)
    for row in reader:
        # Bucket Table 2 rows by the foreign key FID.
        data.setdefault(row["FID"], {}).update(row)
with open(fileA, "rt") as fp:
    reader = csv.DictReader(fp, dialect=excel_semicolon)
    fieldnames.extend(reader.fieldnames)
    for row in reader:
        # Merge Table 1 fields into the bucket with the matching key.
        data.setdefault(row["ID"], {}).update(row)
# De-duplicate column names while preserving first-seen order.
fieldnames = list(OrderedDict.fromkeys(fieldnames))
with open("merged2.csv", "wt", newline='') as fp:
    writer = csv.writer(fp, dialect=excel_semicolon)
    writer.writerow(fieldnames)
    for row in data.values():
        # Missing fields become empty cells.
        writer.writerow([row.get(field, '') for field in fieldnames])
The join operation works like this, but my problem is that I want to remove certain fields from table 2 from the joined csv (e.g. XID). Is there a simple way to do this?
My solution prior to this was using Pandas but the script should run on a server where I don't want to (can't) install the dependencies for the import.

If you wish to take something out you can put in a simple filter using list comprehension.
You create the list here.
fieldnames = list(OrderedDict.fromkeys(fieldnames))
filter out what you do not want.
filtered_fieldnames = [x for x in fieldnames if x != 'XID']
Then change the new file data to the filtered list.
# Write the merged data again, emitting only the filtered column list.
# NOTE(review): the question's writer used dialect=excel_semicolon — pass it
# here too if semicolon-separated output is required.
with open("merged2.csv", "wt", newline='') as fp:
    writer = csv.writer(fp)
    writer.writerow(filtered_fieldnames)
    for row in data.values():
        writer.writerow([row.get(field, '') for field in filtered_fieldnames])
You can wrap it in a function and call it when you either create a new file or wish to take something out..
def create_merged_file(names):
    """Write the module-level ``data`` rows to merged2.csv.

    Only the columns listed in *names* are emitted, in that order;
    rows lacking a column get an empty cell.
    """
    with open("merged2.csv", "wt", newline='') as fp:
        writer = csv.writer(fp)
        writer.writerow(names)
        for row in data.values():
            writer.writerow([row.get(field, '') for field in names])

# Full output, then a second pass with XID removed.
create_merged_file(fieldnames)
filtered_fieldnames = [x for x in fieldnames if x != 'XID']
create_merged_file(filtered_fieldnames)

Related

Create multiple files from unique values of a column using inbuilt libraries of python

I started learning Python and was wondering if there is a way to create multiple files from the unique values of a column. I know there are hundreds of ways of getting it done through pandas, but I am looking to have it done through built-in libraries. I couldn't find a single example where it's done through built-in libraries.
Here is the sample csv file data:
uniquevalue|count
a|123
b|345
c|567
d|789
a|123
b|345
c|567
Sample output file:
a.csv
uniquevalue|count
a|123
a|123
b.csv
b|345
b|345
I am struggling with looping on unique values in a column and then print them out. Can someone explain with logic how to do it ? That will be much appreciated. Thanks.
import csv
from collections import defaultdict

# Split inputfile.csv into one file per unique value of the first column,
# repeating the header in each output file.
header = []
data = defaultdict(list)  # unique first-column value -> rows with that value
DELIMITER = "|"

with open("inputfile.csv", newline="") as csvfile:
    reader = csv.reader(csvfile, delimiter=DELIMITER)
    for i, row in enumerate(reader):
        if i == 0:
            header = row  # first line is the header
        else:
            key = row[0]
            data[key].append(row)

for key, value in data.items():
    filename = f"{key}.csv"
    with open(filename, "w", newline="") as f:
        writer = csv.writer(f, delimiter=DELIMITER)
        rows = [header] + value
        writer.writerows(rows)
import csv

# Append each row of sample.csv to a file named after its first column.
with open('sample.csv', newline='') as csvfile:
    reader = csv.reader(csvfile)
    header = next(reader)  # skip the header so it is not written to "uniquevalue.csv"
    for row in reader:
        # newline='' added: required by the csv module to avoid blank lines on Windows
        with open(f"{row[0]}.csv", 'a', newline='') as inner:
            # fixed: csv.writer() has no fieldnames parameter (that is DictWriter);
            # passing it raised TypeError
            writer = csv.writer(inner, delimiter='|')
            writer.writerow(row)
The task can also be done without using the csv module. The lines of the file are read, and with read_file.read().splitlines()[1:] the newline characters are stripped off, also skipping the header line of the csv file. With a set, a unique collection of the input data is created; it is used to count the number of duplicates and to create the output files.
# csv-module-free variant: read raw lines, drop the header, and for each
# distinct line write it out count() times to a file named after the part
# before the '|' separator.
with open("unique_sample.csv", "r") as read_file:
    items = read_file.read().splitlines()[1:]  # strip newlines, skip header

for line in set(items):
    with open(line[:line.index('|')] + '.csv', 'w') as output:
        # duplicate lines are reproduced via items.count(line)
        output.write((line + '\n') * items.count(line))

csv skipping appending data skips rows

I have python code for appending data to the same csv, but when I append the data, it skips rows, and starts from row 15, instead from row 4
import csv

# Read the 'Values' column, compute successive differences, and append them
# as new rows. Indentation restored from the flattened paste.
with open('csvtask.csv', 'r') as csv_file:
    csv_reader = csv.DictReader(csv_file)
    ls = []
    for line in csv_reader:
        if len(line['Values']) != 0:
            ls.append(int(line['Values']))

new_ls = ['', '', '']  # padding so output starts at the intended row
for i in range(len(ls) - 1):
    new_ls.append(ls[i + 1] - ls[i])
print(new_ls)

with open('csvtask.csv', 'a', newline='') as new_file:
    csv_writer = csv.writer(new_file)
    for i in new_ls:
        csv_writer.writerow(('', '', '', '', i))
# fixed: removed redundant new_file.close() — the with-statement closes the file
Here is the image
It's not really feasible to update a file at the same time you're reading it, so a common workaround is to create a new file. The following does that while preserving the fieldnames in the original file. The new column will be named Diff.
Since there's no previous value to use to calculate a difference for the first row, the rows of the files are processed using the built-in enumerate() function which provides a value each time it's called which provides the index of the item in the sequence as well as the item itself as the object is iterated. You can use the index to know whether the current row is the first one or not and handle in a special way.
import csv

# Pass 1: read csv file and calculate the values of the new 'Diff' column.
with open('csvtask.csv', 'r', newline='') as file:
    reader = csv.DictReader(file)
    fieldnames = reader.fieldnames  # save for later
    diffs = []
    prev_value = 0
    for i, row in enumerate(reader):
        # treat an empty cell as 0 so subtraction is always defined
        row['Values'] = int(row['Values']) if row['Values'] else 0
        # first row has no predecessor, so its diff is blank
        diff = row['Values'] - prev_value if i > 0 else ''
        prev_value = row['Values']
        diffs.append(diff)

# Pass 2: read the file again and write an updated copy with the new column.
fieldnames.append('Diff')  # name of new field
with open('csvtask.csv', 'r', newline='') as inp:
    reader = csv.DictReader(inp)
    with open('csvtask_updated.csv', 'w', newline='') as outp:
        writer = csv.DictWriter(outp, fieldnames)
        writer.writeheader()
        for i, row in enumerate(reader):
            row.update({'Diff': diffs[i]})  # add new column
            writer.writerow(row)
print('Done')
You can use the DictWriter function like this:-
header = ["data", "values"]
writer = csv.DictWriter(file, fieldnames=header)
writer.writeheader()  # fixed: DictWriter does not emit the header automatically
# fixed: DictWriter.writerows() expects mappings, not plain lists —
# passing [[1, 2], [4, 6]] raises an error
data = [{"data": 1, "values": 2}, {"data": 4, "values": 6}]
writer.writerows(data)

Compare a column between 2 csv files and write differences using Python

I am trying to print out the differences by comparing a column between 2 csv files.
CSV1:
SERVER, FQDN, IP_ADDRESS,
serverA, device1.com, 10.10.10.1
serverA,device2.com,10.11.11.1
serverC,device3.com,10.12.12.1
and so on..
CSV2:
FQDN, IP_ADDRESS, SERVER, LOCATION
device3.com,10.12.12.1,serverC,xx
device679.com,20.3.67.1,serverA,we
device1.com,10.10.10.1,serverA,ac
device345.com,192.168.2.0,serverA,ad
device2.com,192.168.6.0,serverB,af
and so on...
What I am looking to do is to compare the FQDN column and write the differences to a new csv output file. So my output would look something like this:
Output.csv:
FQDN, IP_ADDRESS, SERVER, LOCATION
device679.com,20.3.67.1,serverA,we
device345.com,192.168.2.0,serverA,ad
and so on..
I have tried, but not able to get the output.
This is my Code, please tell me where i am going wrong;
import csv

data = {}  # dict mapping SERVER -> FQDN collected from CSV1
with open('CSV1.csv', 'r') as lookuplist:
    reader1 = csv.reader(lookuplist)
    for col in reader1:
        # fixed: was DATA[col[0]] — Python is case-sensitive, so that raised NameError
        data[col[0]] = col[1]

with open('CSV2.csv', 'r') as csvinput, open('Output.csv', 'w', newline='') as f_output:
    reader2 = csv.reader(csvinput)
    csv_output = csv.writer(f_output)
    fieldnames = ['FQDN', 'IP_ADDRESS', 'SERVER']
    csv_output.writerow(fieldnames)  # prints header to the output file
    # fixed: the original iterated the exhausted reader1 and tested membership
    # against a reader object; compare CSV2's FQDN column against CSV1's FQDNs.
    fqdns = set(data.values())
    for col in reader2:
        if col[0] not in fqdns:
            csv_output.writerow(col)
(EDIT) This is another approach that I have used:
import csv

f1 = open("CSV1.csv")
f2 = open("CSV2.csv")
csv_f1 = csv.reader(f1)
csv_f2 = csv.reader(f2)
# NOTE(review): zip pairs rows positionally, so this only compares row i of one
# file against row i of the other — it cannot find an FQDN that appears on a
# different line. A set of all CSV1 FQDNs is needed for a real membership test.
for col1, col2 in zip(csv_f1, csv_f2):
    if col2[0] not in col1[1]:
        print(col2[0])
Basically, here I am only trying to find out first whether the unmatched FQDNs are printed or not. But it is printing out the whole CSV1 column instead. Please help — a lot of research has gone into this, but I have found no luck yet.
This code uses the built-in difflib to spit out the lines from file1.csv that don't appear in file2.csv and vice versa.
I use the Differ object for identifying line changes.
I assumed that you would not regard line swapping as a difference, that's why I added the sorted() function call.
from difflib import Differ

# sorted() so that mere line reordering is not reported as a difference
csv_file1 = sorted(open("file1.csv", 'r').readlines())
csv_file2 = sorted(open("file2.csv", 'r').readlines())

with open("diff.csv", 'w') as f:
    # fixed: the original line had an unbalanced extra ')' — a SyntaxError
    for line in Differ().compare(csv_file1, csv_file2):
        # Differ prefixes each line with a 2-char code: '  ', '- ', '+ ', '? '
        dmode, line = line[:2], line[2:]
        if dmode.strip() == "":
            continue  # unchanged line — not a difference, skip it
        f.write(line + "\n")
Note that if the line differs somehow (not only in the FQDN column) it would appear in diff.csv
import csv

data = {}  # kept from the question; unused by this approach
with open('CSV1.csv', 'r') as lookuplist, open('CSV2.csv', 'r') as csvinput, open('Output.csv', 'w') as f_output:
    reader1 = csv.reader(lookuplist)
    reader2 = csv.reader(csvinput)
    csv_output = csv.writer(f_output)
    fieldnames = ['FQDN', 'IP_ADDRESS', 'SERVER', 'LOCATION']
    csv_output.writerow(fieldnames)  # prints header to the output file
    # set instead of list: O(1) membership test per CSV2 row instead of O(n)
    _tempFqdn = set()
    for i, dt in enumerate(reader1):
        if i == 0:
            continue  # skip CSV1 header
        _tempFqdn.add(dt[1].strip())
    for i, col in enumerate(reader2):
        if i == 0:
            continue  # skip CSV2 header
        if col[0].strip() not in _tempFqdn:
            csv_output.writerow(col)
import csv

data = {}  # FQDNs seen in CSV1, used as a membership set
with open('CSV1.csv', 'r') as lookuplist:
    reader1 = csv.reader(lookuplist)
    for col in reader1:
        # key and value are both the FQDN (column 1 of CSV1)
        data[col[1]] = col[1]

with open('CSV2.csv', 'r') as csvinput, open('Output.csv', 'w', newline='') as f_output:
    reader2 = csv.reader(csvinput)
    csv_output = csv.writer(f_output)
    fieldnames = ['SERVER', 'FQDN', 'AUTOMATION_ADMINISTRATOR', 'IP_ADDRESS', 'PRIMARY_1', 'MHT_1', 'MHT_2',
                  'MHT_3']
    csv_output.writerow(fieldnames)  # prints header to the output file
    for col in reader2:
        if col[0] not in data:  # CSV2 row whose FQDN never appears in CSV1
            # only the FQDN itself is written for unmatched rows
            col = [col[0]]
            csv_output.writerow(col)
So basically, I only had to change 'not in' under 'for loop' and change the columns in the data list that will be reading from the CSV1 file that I am creating.

How to sort uneven dictionary by key and create CSV

I have a python dictionary which for each KEY one can have a variable number of VALUES (arranged in a list).
For example:
{'607': [36146], '448': [50890, 44513], '626': [44349, 44436]}
What I'd like to do is generate a CSV of this information with a format like so:
448 , 607 , 626
50890,36146,44349
44513, ,44436
Currently my code can produce a CSV such as this, the only issue being that the columns of the CSV are not sorted according to the ascending numerical order of the KEYs. My code so far is below:
csv_file = 'file.csv'
with open(csv_file, 'wb') as fd:  # Python 2 code ('wb' + izip_longest)
    writer = csv.writer(fd, delimiter=',')
    # Format headers for aesthetics
    csv_headers = [' {} '.format(elem) for elem in dictionary.keys()]
    # fixed: the original wrote writerow(headers) but only csv_headers exists — NameError
    writer.writerow(csv_headers)
    # Transpose the value lists into rows, padding short columns with ' '
    csv_data = itertools.izip_longest(*dictionary.values(), fillvalue=' ')
    writer.writerows(csv_data)
As you can see I split the KEYs from the VALUEs and write them separately but if I want to sort the columns by the KEYs I imagine this is probably not the best way to go about this. Therefore, I was hoping someone could point me in the right (and most pythonic) direction.
You have two options:
Sort the keys, then extract values in the same order rather than rely on dictionary.values()
Use a csv.DictWriter() object and produce dictionaries per row.
Option 1 looks like this:
csv_file = 'file.csv'
with open(csv_file, 'wb') as fd:  # Python 2 code ('wb' + izip_longest)
    writer = csv.writer(fd, delimiter=',')
    # Sort keys once, then use the same order for both headers and columns
    keys = sorted(dictionary)
    # Format headers for aesthetics
    headers = [' {} '.format(key) for key in keys]
    writer.writerow(headers)
    # Transpose value lists into rows in sorted-key order, padding with ' '
    csv_data = itertools.izip_longest(*(dictionary[key] for key in keys),
                                      fillvalue=' ')
    writer.writerows(csv_data)
Using DictWriter would look like:
csv_file = 'file.csv'
with open(csv_file, 'wb') as fd:  # Python 2 code ('wb' + izip_longest)
    # fieldnames = sorted keys, so DictWriter orders the columns
    writer = csv.DictWriter(
        fd, sorted(dictionary), delimiter=',')
    # write formatted headers
    # fixed typo: was 'dicitonary' — NameError
    writer.writerow({k: ' {} '.format(k) for k in dictionary})
    csv_data = itertools.izip_longest(*dictionary.values(), fillvalue=' ')
    # zip pairs each key (insertion order) with its padded column value
    writer.writerows(dict(zip(dictionary, row)) for row in csv_data)
I went for sorting and ending up with a transposed tuple of key and an iterable of the lists, then went from there:
import csv
from itertools import izip_longest  # Python 2 code

d = {'607': [36146], '448': [50890, 44513], '626': [44349, 44436]}

with open('output.csv', 'wb') as fout:
    csvout = csv.writer(fout)
    # sort by key, then unzip into a header tuple and per-column iterators
    header, rows = zip(*sorted((k, iter(v)) for k, v in d.iteritems()))
    # '^5' centers each header in a 5-character field
    csvout.writerow(format(el, '^5') for el in header)
    # transpose the columns into rows, padding short columns with ' '
    csvout.writerows(izip_longest(*rows, fillvalue=' '))

How to read one single line of csv data in Python?

There is a lot of examples of reading csv data using python, like this one:
import csv

# Print every parsed row of the file.
with open('some.csv', newline='') as f:
    reader = csv.reader(f)
    for row in reader:
        print(row)
I only want to read one line of data and enter it into various variables. How do I do that? I've looked everywhere for a working example.
My code only retrieves the value for i, and none of the other values
reader = csv.reader(csvfile, delimiter=',', quotechar='"')
for row in reader:
    i = int(row[0])
    a1 = int(row[1])
    # NOTE(review): b1..z1 all read row[2] — this is why only i (and a1) look
    # right; the indices were presumably meant to be row[2], row[3], row[4], ...
    b1 = int(row[2])
    c1 = int(row[2])
    x1 = int(row[2])
    y1 = int(row[2])
    z1 = int(row[2])
To read only the first row of the csv file use next() on the reader object.
with open('some.csv', newline='') as f:
    reader = csv.reader(f)
    row1 = next(reader)  # gets the first line
    # now do something here
    # if first row is the header, call next() again for the first data row:
    # row2 = next(reader)
    # fixed: the original suggested next(f), which returns the raw text line,
    # not a parsed list of fields
or :
with open('some.csv', newline='') as f:
    reader = csv.reader(f)
    for row in reader:
        # do something here with `row`
        break  # stop after the first row
you could get just the first row like:
with open('some.csv', newline='') as f:
    csv_reader = csv.reader(f)
    csv_headings = next(csv_reader)  # header row
    first_line = next(csv_reader)    # first data row
You can use Pandas library to read the first few lines from the huge dataset.
import pandas as pd

# nrows=1 reads only the first data row (the header line is parsed separately)
data = pd.read_csv("names.csv", nrows=1)
You can mention the number of lines to be read in the nrows parameter.
Just for reference, a for loop can be used after getting the first row to get the rest of the file:
with open('file.csv', newline='') as f:
    reader = csv.reader(f)
    row1 = next(reader)  # gets the first line
    for row in reader:
        print(row)  # prints rows 2 and onward
From the Python documentation:
And while the module doesn’t directly support parsing strings, it can easily be done:
import csv

# Parse an in-memory string by wrapping it in a single-element list.
for row in csv.reader(['one,two,three']):
    print(row)  # fixed: print is a function in Python 3, not a statement
Just drop your string data into a singleton list.
A simple way to get any row of a csv file:
import csv

# fixed: Python 3's csv module requires text mode, not 'rb'
csvfile = open('some.csv', 'r', newline='')
csvFileArray = []
# NOTE(review): delimiter='.' is unusual for CSV — ',' was probably intended
for row in csv.reader(csvfile, delimiter='.'):
    csvFileArray.append(row)
print(csvFileArray[0])
To print a range of line, in this case from line 4 to 7
import csv

with open('california_housing_test.csv') as csv_file:
    data = csv.reader(csv_file)
    # list(data) materializes the whole file; the slice picks rows 5-7
    # (0-based indices 4:7)
    for row in list(data)[4:7]:
        print(row)
I think the simplest way is the best way, and in this case (and in most others) is one without using external libraries (pandas) or modules (csv). So, here is the simple answer.
# no need to give any mode, keep it simple
with open('some.csv') as f:
    # store in a variable to be used later
    # fixed: file objects have no nextline() method — AttributeError;
    # readline() (or next(f)) returns the first line
    my_line = f.readline()
    # do what you like with 'my_line' now

Categories