Merging two csv files into list of dictionaries - python

I have a task to do and I got stuck, because whatever I do it doesn't seem to work.
So I have two CSV files.
The first is called persons_file and it contains the header line: id, name, surname.
And visits_file contains: id, person_id, site.
I have to write a function called merge that gets the two files as arguments (both StringIO type) and returns a list of dictionaries with the number of visits for each user:
[
    {
        "id": (person's id),
        "name": (person's name),
        "surname": (person's surname),
        "visits": (number of visits)
    }
]
I came up with this and I don't know where my mistake is.
import io

def merge(persons_file, visits_file):
    line_counter = 0
    return_list = []
    list_of_person_ids = []
    visits = 0
    for row in visits_file:
        if line_counter == 0:
            line_counter += 1
            continue
        list_of_person_ids.append(row.split(',')[1])
    line_counter = 0
    for row in persons_file:
        if line_counter == 0:
            line_counter += 1
            continue
        help_dict = {}
        split_row = row.split(',')
        help_dict['id'] = split_row[0]
        help_dict['name'] = split_row[1]
        help_dict['surname'] = split_row[2][:len(split_row[2]) - 1]
        if split_row[0] in list_of_person_ids:
            visits = list_of_person_ids.count(split_row[0])
        help_dict['visits'] = str(visits)
        return_list.append(help_dict)
        visits = 0
    return return_list

file1 = open('persons_file.csv', mode='r')
file2 = open('visits_file.csv', mode='r')
persons_file_arg = io.StringIO(file1.read())
visits_file_arg = io.StringIO(file2.read())
list_of_visits = merge(persons_file_arg, visits_file_arg)
for i in list_of_visits:
    print(i)
file1.close()
file2.close()
I'd be glad if anyone could help me.

What is the issue? Is it the output that is not what you expected, or are you getting an exception? Your code seems like it should achieve the result you want, but I have a couple suggestions to make that could simplify things.
Look into collections.Counter. You could then call count_of_visits_by_person_id = Counter(list_of_person_ids) to get a result of the form {person_id: number_of_visits, ...}. You could then use this to simply look up the number of visits in your next for loop, e.g.:
from collections import Counter
...
count_of_visits_by_person_id = Counter(list_of_person_ids)
for row in persons_file:
    if line_counter == 0:
        line_counter += 1
        continue
    help_dict = {}
    split_row = row.split(',')
    help_dict['id'] = split_row[0]
    help_dict['name'] = split_row[1]
    help_dict['surname'] = split_row[2][:-1]
    # [:len(split_row[2]) - 1] is equivalent to [:-1]
    # I assume you are stripping whitespace from the right side,
    # which can also be accomplished using split_row[2].rstrip()
    if split_row[0] in count_of_visits_by_person_id:
        visits = count_of_visits_by_person_id[split_row[0]]
    else:
        visits = 0
    help_dict['visits'] = str(visits)
    return_list.append(help_dict)
The generally simpler and safer way to open files is using the with statement. Here is an example:
with open('visits_file.csv', mode='r') as visits_file:
    visits_file.readline()  # skip the header line
    for row in visits_file:
        list_of_person_ids.append(row.split(',')[1])
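For reference, here is a condensed sketch of the whole merge function using csv.DictReader together with Counter. This is untested against your exact data, and it assumes the headers are exactly id,name,surname and id,person_id,site with no stray spaces after the commas:

import csv
from collections import Counter

def merge(persons_file, visits_file):
    # count how many times each person_id appears in the visits file
    visit_counts = Counter(row['person_id'] for row in csv.DictReader(visits_file))
    return [
        {
            'id': row['id'],
            'name': row['name'],
            'surname': row['surname'],
            'visits': str(visit_counts.get(row['id'], 0)),
        }
        for row in csv.DictReader(persons_file)
    ]

csv.DictReader skips the header line and strips the trailing newline for you, so none of the manual line_counter bookkeeping is needed.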

Related

How to convert a csv-file to a dictionary of lists with python?

I'm trying to get a dictionary of lists (one list per column) as a result.
Here is the csv-file:
OsmID,NewName,IdLocal
1020287758,NN1,Id0001
1021229973,NN2,Id0002
1025409497,NN3,Id0003
I'm using the code below:
import csv

input = r'C:\Users\_M92\csvFiles\csv0001.csv'
fileRead = open(input, 'r')
with open(input, 'r') as csv:
    headerLine = fileRead.readline()
    header = headerLine.split(",")
    #print(header)
    nameIndex = header.index("OsmID")
    output = {}
    for line in fileRead.readlines():
        values = line.split(",")
        output[values[nameIndex]] = values
    print(output)
And it results in the following error:
File "c:\Users\_M92\Scripts\CsvToDict.py", line 19, in <module>
    nameIndex = header.index("OsmID")
ValueError: 'OsmID' is not in list
Instead of manually splitting each line by commas, use the CSV module that you've imported. This module contains a DictReader class that will yield dictionaries for each row. Then, you just need to add this to your output dictionary.
import csv

# Create an empty dictionary
# We will add keys to this as needed
output = {}

# Keep track of the number of rows, so we can add an empty column if needed
row_count = 0

# This function adds a row to the output dictionary
def add_row(row_dict):
    global row_count  # Need to declare this as global because we're assigning to the variable in this function
    if not row_dict:
        return  # If the row is empty, do nothing
    for k, v in row_dict.items():
        # Loop over all key-value pairs in the row to add
        if k not in output:  # If the output doesn't contain this column, create a blank column
            output[k] = [None] * row_count
        output[k].append(v)  # Append the value to the correct column in output
    row_count += 1

input_file = r'C:\Users\_M92\csvFiles\csv0001.csv'
with open(input_file, 'r') as fh:
    reader = csv.DictReader(fh)  # Create a DictReader
    for row in reader:
        add_row(row)  # Add every row to the output
This gives the following output:
{'OsmID': ['1020287758', '1021229973', '1025409497'],
'NewName': ['NN1', 'NN2', 'NN3'],
'IdLocal': ['Id0001', 'Id0002', 'Id0003']}
Note: I removed the blank lines in the input csv you provided, but it doesn't make a difference to the program, since a blank line will yield an empty dictionary from DictReader, and add_row doesn't do anything with empty dicts.
Note 2: You could discard the row_count variable if you dynamically count the number of rows like so:
def add_row(row_dict):
    row_count = 0
    for first_key, first_val in output.items():
        row_count = len(first_val)
        break  # We can just break out here because all keys should have the same number of values
    # Create keys that do not yet exist in output but do exist in the new row
    existing_keys = set(output.keys())
    new_row_keys = set(row_dict.keys())
    keys_to_create = new_row_keys - existing_keys
    for key in keys_to_create:
        output[key] = [None] * row_count
    # Append to each column in output
    for key in output:
        output[key].append(row_dict.get(key, None))  # If the key doesn't exist in the current row, append None
You could use pandas:
import pandas as pd

f = r'C:\Users\_M92\csvFiles\csv0001.csv'
df = pd.read_csv(f).to_dict('list')
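One caveat: pandas infers dtypes, so the OsmID values would come back as integers rather than strings like in the other answers. If you want strings (an assumption on my part), pass dtype=str:

import pandas as pd

f = r'C:\Users\_M92\csvFiles\csv0001.csv'
# dtype=str keeps every column as strings instead of letting pandas infer ints
print(pd.read_csv(f, dtype=str).to_dict('list'))
# {'OsmID': ['1020287758', '1021229973', '1025409497'],
#  'NewName': ['NN1', 'NN2', 'NN3'], 'IdLocal': ['Id0001', 'Id0002', 'Id0003']}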
Here is a snippet to start from. This is the 'from scratch' method; please use a library to do it properly:
import os

input_path = r'test.csv'
header_line = 0
sep_csv_line = "\n\n"
sep_csv_column = ","

with open(os.path.join(os.path.dirname(__file__), input_path), 'r') as csv:
    content = csv.read()

split = content.split(sep_csv_line)
columns = split[header_line].split(sep_csv_column)
print(f"{columns = }")

output = {}
for column in columns:
    output[column] = []

for line in split[header_line + 1:]:
    print(f"{line = }")
    elements = line.split(sep_csv_column)
    print(f"{elements = }")
    for i, column in enumerate(columns):
        element = elements[i]
        print(f"{element = }")
        output[column].append(element)

print(f"{output = }")
print(f"{output['OsmID'] = }")
Here is the output console:
columns = ['OsmID', 'NewName', 'IdLocal']
line = '1020287758,NN1,Id0001'
elements = ['1020287758', 'NN1', 'Id0001']
element = '1020287758'
element = 'NN1'
element = 'Id0001'
line = '1021229973,NN2,Id0002'
elements = ['1021229973', 'NN2', 'Id0002']
element = '1021229973'
element = 'NN2'
element = 'Id0002'
line = '1025409497,NN3,Id0003'
elements = ['1025409497', 'NN3', 'Id0003']
element = '1025409497'
element = 'NN3'
element = 'Id0003'
output = {'OsmID': ['1020287758', '1021229973', '1025409497'], 'NewName': ['NN1', 'NN2', 'NN3'], 'IdLocal': ['Id0001', 'Id0002', 'Id0003']}
output['OsmID'] = ['1020287758', '1021229973', '1025409497']

How to remove an element from a JSON array using Python?

I'm currently trying to make a Chromebook rental application for my high school that stores checkout information in a JSON file. Everything works except removing data from the JSON array. I found a YouTube video (link) that I thought would work as a solution, so I followed along with that. However, whenever there are more than two elements and I enter anything higher than two, it doesn't delete anything. Even worse, when I enter the number one, it deletes everything but the zero index (whenever the array has more than two elements in it).
Here's the Python code:
def view_data():  # Prints JSON Array to screen
    with open(filename, "r") as f:
        data = json.load(f)
        i = 0
        for item in data:
            name = item["name"]
            chromebook = item["chromebook"]
            check_out = item["time&date"]
            print(f"Index Number: {i}")
            print(f"Name : {name}")
            print(f"Chromebook : {chromebook}")
            print(f"Time Of Checkout: {check_out} ")
            print("\n\n")
            i = i + 1

def delete_data():  # Deletes an element from the array
    view_data()
    new_data = []
    with open(filename, "r") as f:
        data = json.load(f)
        data_length = len(data) - 1
        print("Which index number would you like to delete?")
        delete_option = input(f"Select a number 0-{data_length}: ")
        i = 0
        for entry in data:
            if i == int(delete_option):
                pass
                i = + 1
            else:
                new_data.append(entry)
                i = + 1
    with open(filename, "w") as f:
        json.dump(new_data, f, indent=4)
Here's the JSON file code:
[
    {
        "name": "Tyler",
        "chromebook": "12123223",
        "time&date": "Check Out Time: 13:33:22 May-11-2021"
    },
    {
        "name": "Craig",
        "chromebook": "41222224",
        "time&date": "Check Out Time: 13:33:34 May-11-2021"
    },
    {
        "name": "Bill",
        "chromebook": "3235223",
        "time&date": "Check Out Time: 13:33:46 May-11-2021"
    }
]
For example, say the user wanted to remove the second index in the JSON array. Is there a better way to implement that in my Python script?
I'm still a fairly new and learning Python developer, and if there's any better solution, I'm open to suggestions. If you need more info, I'll be active.
First question
However, whenever there are more than two elements and I enter anything higher than two, it doesn't delete anything. Even worse, when I enter the number one, it deletes everything but the zero index (whenever the array has more than two elements in it).
Inside delete_data() you have two lines reading i = + 1, which just assigns +1 (i.e., 1) to i. Thus, you're never increasing your index. You probably meant to write either i = i + 1 or i += 1.
def delete_data():  # Deletes an element from the array
    view_data()
    new_data = []
    with open(filename, "r") as f:
        data = json.load(f)
        data_length = len(data) - 1
        print("Which index number would you like to delete?")
        delete_option = input(f"Select a number 0-{data_length}: ")
        i = 0
        for entry in data:
            if i == int(delete_option):
                i += 1  # <-- here
            else:
                new_data.append(entry)
                i += 1  # <-- and here
    with open(filename, "w") as f:
        json.dump(new_data, f, indent=4)
Second question: further improvements
Is there a better way to implement that in my Python script?
First, you can get rid of manually increasing i by using the builtin enumerate generator. Second, you could make your functions reusable by giving them parameters - where does the filename in your code example come from?
# view_data() should probably receive `filename` as a parameter
def view_data(filename: str):  # Prints JSON Array to screen
    with open(filename, "r") as f:
        data = json.load(f)
        # iterate over i and data simultaneously
        # alternatively, you could just remove i
        for i, item in enumerate(data):
            name = item["name"]
            chromebook = item["chromebook"]
            check_out = item["time&date"]
            print(f"Index Number: {i}")
            print(f"Name : {name}")
            print(f"Chromebook : {chromebook}")
            print(f"Time Of Checkout: {check_out} ")
            print("\n\n")
            # not needed anymore: i = i + 1

# delete_data() should probably receive `filename` as a parameter
def delete_data(filename: str):  # Deletes an element from the array
    view_data(filename)  # pass the filename through
    new_data = []
    with open(filename, "r") as f:
        data = json.load(f)
        data_length = len(data) - 1
        print("Which index number would you like to delete?")
        delete_option = input(f"Select a number 0-{data_length}: ")
        # iterate over i and data simultaneously
        for i, entry in enumerate(data):
            if i != int(delete_option):
                new_data.append(entry)
    with open(filename, "w") as f:
        json.dump(new_data, f, indent=4)
Furthermore, you could replace that for-loop by a list comprehension, which some may deem more "pythonic":
new_data = [entry for i, entry in enumerate(data) if i != int(delete_option)]
There are easier ways to delete an element by index from a Python list.
Given li = ["a", "b", "c"], you can delete element 1 ("b") by index in (at least) the following ways:
li.pop(1) # pop takes an index (defaults to last) and removes and returns the element at that index
del li[1] # the del keyword will also remove an element from a list
So, here's some updated code:
def view_data():  # Prints JSON Array to screen
    with open(filename, "r") as f:
        data = json.load(f)
        i = 0
        for item in data:
            name = item["name"]
            chromebook = item["chromebook"]
            check_out = item["time&date"]
            print(f"Index Number: {i}")
            print(f"Name : {name}")
            print(f"Chromebook : {chromebook}")
            print(f"Time Of Checkout: {check_out} ")
            print("\n\n")
            i = i + 1

def delete_data():  # Deletes an element from the array
    view_data()
    with open(filename, "r") as f:
        data = json.load(f)
    data_length = len(data) - 1
    print("Which index number would you like to delete?")
    delete_option = input(f"Select a number 0-{data_length}: ")
    del data[int(delete_option)]  # or data.pop(int(delete_option))
    with open(filename, "w") as f:
        json.dump(data, f, indent=4)
import json

data = json.loads(jsonString)  # convert JSON string to a Python object
data_length = len(data) - 1    # highest valid index
delete_option = input(f"Select a number 0-{data_length}: ")
del data[int(delete_option)]

Python iterating/look up dictionary unexpected behavior

I have a problem when I try to look up data in a csv dictionary. A list of dates and times is in one csv, and the script should look up the data for each specific date and time in a second csv. I look for an exact match and the 22 following records. The problem is that it only fetches the first date and time; the rest are not found even though I can see they are there. I feel like this has a very easy solution, but I can't think of anything. It must be a problem in my iteration code.
Code:
import csv

csv_eph = open("G:\\db.csv")
csv_reader_eph = csv.reader(csv_eph, delimiter=",")
csv_dict_eph = csv.DictReader(csv_eph)

csv_matches = open("G:\\query.csv")
csv_reader_matches = csv.reader(csv_matches, delimiter=",")
csv_dict_matches = csv.DictReader(csv_matches)

result = []
var = 0
for row in csv_dict_matches:
    datum = row["Date"]
    cas = row["Time"]
    result.append('\n')
    result.append(row)
    for eph in csv_dict_eph:
        if str(eph["datum"]) == str(datum) and str(eph["cas"]) == str(cas):
            var = 23
        if var > 0:
            result.append(eph)
            var = var - 1

with open("G:\\compiled.txt", "w") as output:
    for item in result:
        output.write(str(item))
        output.write('\n')
SOLUTION!
I implemented jasonharper's solution and it works flawlessly, many thanks. It was indeed a problem with the reader being exhausted at the end of the first pass. Now fixed, it looks like this and works as intended:
import csv

csv_eph = open("G:\\db.csv")
csv_reader_eph = csv.reader(csv_eph, delimiter=",")
csv_dict_eph = csv.DictReader(csv_eph)

csv_matches = open("G:\\query.csv")
csv_reader_matches = csv.reader(csv_matches, delimiter=",")
csv_dict_matches = csv.DictReader(csv_matches)

# jasonharper
eph_list = []
for eph in csv_dict_eph:
    eph_list.append(eph)
print(eph_list)

result = []
var = 0
for row in csv_dict_matches:
    print(row)
    datum = row["Date"]
    cas = row["Time"]
    result.append('\n')
    result.append(row)
    for eph in eph_list:
        if str(eph["datum"]) == str(datum) and str(eph["cas"]) == str(cas):
            var = 23
        if var > 0:
            result.append(eph)
            var = var - 1

with open("G:\\compiled.txt", "w") as output:
    for item in result:
        output.write(str(item))
        output.write('\n')
I believe changing:
csv_dict_eph = csv.DictReader (csv_eph)
to:
csv_dict_eph = list(csv.DictReader(csv_eph))
will fix the problem.
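The underlying issue is that a DictReader is a one-shot iterator over the underlying file: once the inner loop has read to the end of db.csv, every later pass over csv_dict_eph sees nothing. A small self-contained demonstration, with io.StringIO standing in for the file:

import csv
import io

buf = io.StringIO("datum,cas\n2021-05-11,13:33\n2021-05-12,14:00\n")
reader = csv.DictReader(buf)
print(list(reader))  # first pass: both rows
print(list(reader))  # second pass: [] -- the file object is exhausted
buf.seek(0)          # rewind the file and build a fresh reader to iterate again
print(list(csv.DictReader(buf)))

Materializing the rows into a list up front, as in the fixes above, avoids the rewind entirely.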

Python read XML file (near 50mb)

I'm parsing an XML string into a CSV string, but it's going very slowly:
INDEX_COLUMN = "{urn:schemas-microsoft-com:office:spreadsheet}Index"
CELL_ELEMENT = "Cell"
DATA_ELEMENT = "Data"

def parse_to_csv_string(xml):
    print('parse_to_csv_string')
    csv = []
    parsed_data = serialize_xml(xml)
    rows = list(parsed_data[1][0])
    header = get_cells_text(rows[0])
    rows.pop(0)
    csv.append(join(",", header))
    for row in rows:
        values = get_cells_text(row)
        csv.append(join(",", values))
    return join("\n", csv)

def serialize_xml(xml):
    return ET.fromstring(xml)

def get_cells_text(row):
    keys = []
    cells = normalize_row_cells(row)
    for elm in cells:
        keys.append(elm[0].text or "")
    while len(keys) < 92:
        keys.append("")
    return keys

def normalize_row_cells(row):
    cells = list(row)
    updated_cells = copy.deepcopy(cells)
    pos = 1
    for elm in cells:
        strIndexAttr = elm.get(INDEX_COLUMN)
        index = int(strIndexAttr) if strIndexAttr else pos
        while index > pos:
            empty_elm = ET.Element(CELL_ELEMENT)
            child = ET.SubElement(empty_elm, DATA_ELEMENT)
            child.text = ""
            updated_cells.insert(pos - 1, empty_elm)
            pos += 1
        pos += 1
    return updated_cells
The XML string sometimes misses a few columns, and I need to iterate over it to fill in the missing columns; every row must have 92 columns. That's why I have some helper functions to manipulate the XML.
Right now I'm running my function as a Lambda with 4GB and still getting a timeout :(
Any idea on how to improve performance?
The normalize_row_cells constructs ElementTree Element instances but get_cells_text is only interested in each instance's child's text attribute, so I would consider changing normalize_row_cells to just return the text. Also, it's performing copies and calling list.insert: inserting elements into the middle of lists can be expensive, because each element after the insertion point must be moved.
Something like this (untested code) avoids making copies and insertions and returns only the required text, making get_cells_text redundant.
def normalize_row_cells(row):
    cells = list(row)
    updated_cells = []
    cell_pos = 0  # next source cell to consume
    for col in range(1, 93):  # target columns 1..92
        if cell_pos < len(cells):
            elm = cells[cell_pos]
            strIndexAttr = elm.get(INDEX_COLUMN)
            index = int(strIndexAttr) if strIndexAttr else col
            if index == col:
                updated_cells.append(elm[0].text or "")
                cell_pos += 1
            else:
                updated_cells.append("")  # this column is missing from the XML
        else:
            updated_cells.append("")  # pad short rows out to 92 columns
    return updated_cells
If you can match your cells to their header names then using csv.DictWriter from the standard library might be even better (you need to profile to be sure).
import csv
import io

def parse_to_csv_string(xml):
    print('parse_to_csv_string')
    parsed_data = serialize_xml(xml)
    rows = list(parsed_data[1][0])
    header = get_cells_text(rows[0])
    with io.StringIO() as f:
        writer = csv.DictWriter(f, fieldnames=header)
        for row in rows:
            row = get_cells_text(row)
            writer.writerow(row)
        f.seek(0)
        data = f.read()
    return data

def get_cells_text(row):
    row_dict = {}
    for cell in row:
        column_name = get_column_name(cell)  # <- can this be done?
        row_dict[column_name] = cell[0].text or ""
    return row_dict

Group and Check-mark using Python

I have several files, each of which has data like this (filename: data inside, separated by newlines):
Mike: Plane\nCar
Paula: Plane\nTrain\nBoat\nCar
Bill: Boat\nTrain
Scott: Car
How can I create a csv file using python that groups all the different vehicles and then puts an X for each applicable person?
Assuming those line numbers aren't in there (easy enough to fix if they are), and with an input file like the following:
Mike: Plane
Car
Paula: Plane
Train
Boat
Car
Bill: Boat
Train
Scott: Car
The solution can be found here: https://gist.github.com/999481
import sys
from collections import defaultdict
import csv

# see http://stackoverflow.com/questions/6180609/group-and-check-mark-using-python

def main():
    # files = ["group.txt"]
    files = sys.argv[1:]
    if len(files) < 1:
        print "usage: ./python_checkmark.py file1 [file2 ... filen]"
        return
    name_map = defaultdict(set)
    for f in files:
        file_handle = open(f, "r")
        process_file(file_handle, name_map)
        file_handle.close()
    print_csv(sys.stdout, name_map)

def process_file(input_file, name_map):
    cur_name = ""
    for line in input_file:
        if ":" in line:
            cur_name, item = [x.strip() for x in line.split(":")]
        else:
            item = line.strip()
        name_map[cur_name].add(item)

def print_csv(output_file, name_map):
    names = name_map.keys()
    items = set([])
    for item_set in name_map.values():
        items = items.union(item_set)
    writer = csv.writer(output_file, quoting=csv.QUOTE_MINIMAL)
    writer.writerow([""] + names)
    for item in sorted(items):
        row_contents = map(lambda name: "X" if item in name_map[name] else "", names)
        row = [item] + row_contents
        writer.writerow(row)

if __name__ == '__main__':
    main()
Output:
,Mike,Bill,Scott,Paula
Boat,,X,,X
Car,X,,X,X
Plane,X,,,X
Train,,X,,X
The only thing this script doesn't do is keep the columns in the same order that the names appear in. You could keep a separate list maintaining the order, since maps/dicts are inherently unordered; see the sketch below. (Note that in Python 3.7+, plain dicts do preserve insertion order.)
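For instance, a hedged sketch of that idea: collect the names in first-seen order while processing (the names_in_order list is new here, not part of the gist), then hand that list to print_csv in place of name_map.keys():

names_in_order = []  # hypothetical: records each name the first time it is seen

def process_file(input_file, name_map):
    cur_name = ""
    for line in input_file:
        if ":" in line:
            cur_name, item = [x.strip() for x in line.split(":")]
            if cur_name not in names_in_order:
                names_in_order.append(cur_name)
        else:
            item = line.strip()
        name_map[cur_name].add(item)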
Here is an example of how to parse these kinds of files.
Note that the dictionary is unordered here. You can use OrderedDict (in the standard library since Python 3.2 / 2.7), find any available implementation/backport if you have an older Python version, or just save the order in an additional list :)
data = {}
name = None
with open(file_path) as f:
    for line in f:
        if ':' in line:  # we have a name here
            name, first_vehicle = [s.strip() for s in line.split(':')]
            data[name] = set([first_vehicle, ])  # a set of vehicles per name
        else:
            if name:
                data[name].add(line.strip())

# now a dictionary with names/vehicles is available
# let's convert it to a simple csv-formatted string..

# a set of all available vehicles
vehicles = set(v for vlist in data.values() for v in vlist)

for name in data:
    name_vehicles = data[name]
    csv_vehicles = ''
    for v in vehicles:
        if v in name_vehicles:
            csv_vehicles += v
        csv_vehicles += ','
    csv_line = name + ',' + csv_vehicles
    # csv_line is rebuilt on every pass; collect it somewhere, e.g. append it to a list
Assuming that the input looks like this:
Mike: Plane
Car
Paula: Plane
Train
Boat
Car
Bill: Boat
Train
Scott: Car
This python script, places the vehicles in a dictionary, indexed by the person:
#!/usr/bin/python
persons={}
vehicles=set()
with open('input') as fd:
for line in fd:
line = line.strip()
if ':' in line:
tmp = line.split(':')
p = tmp[0].strip()
v = tmp[1].strip()
persons[p]=[v]
vehicles.add(v)
else:
persons[p].append(line)
vehicles.add(line)
for k,v in persons.iteritems():
print k,v
print 'vehicles', vehicles
Result:
Mike ['Plane', 'Car']
Bill ['Boat', 'Train']
Scott ['Car']
Paula ['Plane', 'Train', 'Boat', 'Car']
vehicles set(['Train', 'Car', 'Plane', 'Boat'])
Now, all the data needed are placed in data-structures. The csv-part is left as an exercise for the reader :-)
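If you do want the csv part as well, here is a minimal sketch under stated assumptions: it reuses the persons dict and vehicles set built above, sorts both for a deterministic order, and writes to a hypothetical output.csv (in Python 3, pass newline='' to open(), as the csv docs recommend):

import csv

with open('output.csv', 'w') as out:
    writer = csv.writer(out)
    names = sorted(persons)
    writer.writerow([''] + names)  # header row: blank corner cell, then the names
    for vehicle in sorted(vehicles):
        # an X wherever this person's vehicle list contains the vehicle
        writer.writerow([vehicle] + ['X' if vehicle in persons[n] else '' for n in names])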
The most elegant and simple way would be like so:
import os

vehiclesToPeople = {}
people = []
for root, dirs, files in os.walk('/path/to/folder/with/files'):
    for file in files:
        person = file
        people += [person]
        path = os.path.join(root, file)
        with open(path) as f:
            for vehicle in f:
                # strip the trailing newline so 'Car\n' and 'Car' are the same key
                vehiclesToPeople.setdefault(vehicle.strip(), set()).add(person)

people.sort()
table = [[''] + people]
for vehicle, owners in vehiclesToPeople.items():
    table.append([vehicle] + [('X' if p in owners else '') for p in people])

csv = '\n'.join(','.join(row) for row in table)
You can do pprint.pprint(table) as well to look at it.
