I have a JSON file (>1GB) and another CSV file with one matching column (i.e. ID). I need to update the JSON file by mapping the CSV onto the JSON.
My first thought was to convert the JSON to CSV and then overwrite the CSV, but since the file is huge, that's not the most efficient approach. I am supposed to use Python.
import csv
import json

id = []
qrank = []

def readingCsvFile():
    with open('qrank.csv', 'r') as csvFile:
        dataCsv = csv.reader(csvFile)
        for row in dataCsv:
            id.append(row[0])
            qrank.append(row[1])

dataJson = [json.loads(line) for line in open('enhanced-wikipois', 'r', encoding='UTF-8')]
records = len(dataJson)
readingCsvFile()

for i in range(records):
    x = dataJson[i]['id']
    if (x in id):
        pos = id.index(x)
        dataJson[i]['wikiQRank'] = qrank[pos]

print(dataJson)
The size of the file is not really relevant here. What's important is the number of JSON objects and the number of "qrank" values.
If you build a dictionary keyed on id with rank as the value from the CSV file, the subsequent lookups will be much more efficient than repeated list.index() searches.
There are a number of other efficiencies you could implement:
import csv
import json

CSVFILE = '/Volumes/G-Drive/qrank.csv'
JSONLFILE = '/Volumes/G-Drive/enhanced-wikipois'

def read_csv(filename):
    with open(filename, newline='') as data:
        reader = csv.reader(data)
        return {_id: rank for _id, rank, *_ in reader}

def read_jsonl(filename):
    with open(filename) as data:
        return [json.loads(line) for line in data]

id_dict = read_csv(CSVFILE)
json_data = read_jsonl(JSONLFILE)

for j in json_data:
    if (_id := j.get('id')) is not None:
        if (rank := id_dict.get(_id)) is not None:
            j['wikiQRank'] = rank

print(json_data)
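If holding the whole JSONL file in memory is a concern (the question mentions >1GB), here is a minimal streaming sketch, assuming one JSON object per line and writing the updated records to a new file rather than updating in place:

import csv
import json

def stream_update(csv_path, jsonl_in_path, jsonl_out_path):
    # build the id -> rank lookup once; dict lookups are O(1)
    with open(csv_path, newline='') as f:
        ranks = {_id: rank for _id, rank, *_ in csv.reader(f)}
    # process one record at a time so memory use stays flat
    with open(jsonl_in_path, encoding='utf-8') as src, \
         open(jsonl_out_path, 'w', encoding='utf-8') as dst:
        for line in src:
            record = json.loads(line)
            rank = ranks.get(record.get('id'))
            if rank is not None:
                record['wikiQRank'] = rank
            dst.write(json.dumps(record) + '\n')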
I am trying to make a program that saves the results of filtering a JSON file as a CSV. Right now my function only saves the JSON keys to the CSV file.
Ideally I want the function to take two arguments: the column (key) to search in, and the item (value) to search for.
This is my current function:
def save_csv(key, value):
    with open('db.json') as json_file:
        info = json.load(json_file)
        test = info['data']
    csv_file = open('test.csv', 'w')
    csv_writer = csv.writer(csv_file)
    count = 0
    for e in test:
        if count == 0:
            header_csv = e.keys()
            csv_writer.writerow(header_csv)
        count += 1
        for e in key:
            if e == value:
                csv_writer.writerow(e.values())
    csv_file.close()
How could I change this function to make it save the filtered results to the CSV?
No matter what changes I try, it only saves the keys to the header of the CSV; none of the results I am filtering for are saved.
def save_csv(key, value):
    with open('db.json') as json_file:
        info = json.load(json_file)
        test = info['data']
    with open('test.csv', 'w', newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        for n, v in enumerate(test):
            # write the header once, from the first record's keys
            if not n:
                csv_writer.writerow(v.keys())
            # write only the records whose `key` column matches `value`
            if key in v and v.get(key) == value:
                csv_writer.writerow(v.values())
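For example, with a hypothetical db.json (just to illustrate the call, not your actual data):

# db.json: {"data": [{"name": "a", "status": "open"},
#                    {"name": "b", "status": "closed"}]}
save_csv('status', 'open')   # test.csv gets the header row plus the "a" row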
I am in the process of doing a conversion of JSON to XML using Python.
I'm giving a presentation on how, starting with one file (CSV), you can convert it through multiple formats in a chain: CSV to JSON, that JSON to XML, XML to the next file type in the chain, etc., back to CSV.
I obtained a public domain CSV file from Kaggle (https://www.kaggle.com/canggih/anime-data-score-staff-synopsis-and-genre), then converted it to JSON.
From JSON, I am trying to convert to XML and write to an outfile.
I converted the CSV to JSON using this (no formatting, just a straight conversion):
#This should convert CSV to JSON
import json, os
import pandas as pd
import csv
df = pd.read_csv('dataanime.csv')
df.to_json(r'sassyg_data_Anime.json')
Then, I created my JSON to XML file:
#With help from instructor and CodeSpeedy
#https://www.codespeedy.com/how-to-convert-json-to-xml-using-python/
#Import libraries
import json as j
import xml.etree.ElementTree as et
#load in the json file
with open("sassyg_data_Anime.json") as json_file_format:
d = j.load(json_file_format)
#create the main container element for the entire XML file
r = et.Element("Work")
#creates the subelements for each part of the json file
et.SubElement(r,"Title").text = d["Title"]
et.SubElement(r,"Type").text = d["Type"]
et.SubElement(r,"Episodes").text = d["Episodes"]
et.SubElement(r,"Status").text = d["Status"]
et.SubElement(r,"Start airing").text = str(d["Start airing"])
et.SubElement(r,"End airing").text = str(d["End airing"])
et.SubElement(r,"Starting season").text = d["Starting season"]
et.SubElement(r,"Broadcast time").text = d["Broadcast time"]
et.SubElement(r,"Producers").text = d["Producers"]
et.SubElement(r,"Licensors").text = d["Licensors"]
et.SubElement(r,"Studios").text = d["Studios"]
et.SubElement(r,"Sources").text = d["Sources"]
et.SubElement(r,"Genres").text = d["Genres"]
et.SubElement(r,"Duration").text = str(d["Duration"])
et.SubElement(r,"Rating").text = d["Rating"]
et.SubElement(r,"Score").text = str(d["Score"])
et.SubElement(r,"Scored by").text = str(d["Scored by"])
et.SubElement(r,"Members").text = str(d["Members"])
et.SubElement(r,"Favorites").text = str(d["Favorites"])
et.SubElement(r,"Description").text = d["Description"]
#create the element tree/info for the write file
a = et.ElementTree(r)
#ERROR ERROR
#structure the output for xml via tostring rather than str
#Cannot write an ElementTree to file, errors out
#This was one solution I came up with, still errors out
a_xml_str = et.tostring(a)
print(a_xml_str)
#This might error out as well, I can't get the program to get to this point
#write file it should go to
outfile = open("json_to_xml.xml", 'w', encoding='utf-8')
outfile.write(a_xml_str)
outfile.close()
The error I get is:
Traceback (most recent call last):
  File "F:\Data_Int_Final\Gardner_json_to_xml\convert_json_to_xml.py", line 44, in <module>
    a_xml_str = et.tostring(a)
  File "C:\Users\user\AppData\Local\Programs\Python\Python39\lib\xml\etree\ElementTree.py", line 1109, in tostring
    ElementTree(element).write(stream, encoding,
  File "C:\Users\user\AppData\Local\Programs\Python\Python39\lib\xml\etree\ElementTree.py", line 748, in write
    serialize(write, self._root, qnames, namespaces,
  File "C:\Users\user\AppData\Local\Programs\Python\Python39\lib\xml\etree\ElementTree.py", line 873, in _serialize_xml
    tag = elem.tag
AttributeError: 'ElementTree' object has no attribute 'tag'
This is the latest version of the code I've tried. Can anyone see a solution?
Update:
I have two other ways to create the starting JSON file; would one of these be a better approach?
import json
import csv

def make_json(csvFilePath, jsonFilePath):
    data = {}
    with open(csvFilePath, encoding='utf-8') as csvf:
        csvReader = csv.DictReader(csvf)
        for rows in csvReader:
            key = rows['Title']
            data[key] = rows
    with open(jsonFilePath, 'w', encoding='utf-8') as jsonf:
        jsonf.write(json.dumps(data, indent=4))

csvFilePath = r'dataanime.csv'
jsonFilePath = r'dataAnime.json'
make_json(csvFilePath, jsonFilePath)
which errors out my XML conversion when I use this JSON file with it:
Traceback (most recent call last):
  File "F:\Data_Int_Final\convert_json_to_xml.py", line 16, in <module>
    et.SubElement(r,"Title").text = d["Title"]
KeyError: 'Title'
or:
import csv
import json
import time

def csv_to_json(csvFilePath, jsonFilePath):
    jsonArray = []
    # read csv file
    with open(csvFilePath, encoding='utf-8') as csvf:
        # load csv file data using csv library's dictionary reader
        csvReader = csv.DictReader(csvf)
        # convert each csv row into python dict
        for row in csvReader:
            # add this python dict to json array
            jsonArray.append(row)
    # convert python jsonArray to JSON string and write to file
    with open(jsonFilePath, 'w', encoding='utf-8') as jsonf:
        jsonString = json.dumps(jsonArray, indent=4)
        jsonf.write(jsonString)

csvFilePath = r'dataanime.csv'
jsonFilePath = r'g_d_anime.json'

start = time.perf_counter()
csv_to_json(csvFilePath, jsonFilePath)
finish = time.perf_counter()

print(f"Conversion of all rows completed successfully in {finish - start:0.4f} seconds")
which errors out my XML conversion when I use this created JSON file with it:
Traceback (most recent call last):
  File "F:\Data_Int_Final\convert_json_to_xml.py", line 16, in <module>
    et.SubElement(r,"Title").text = d["Title"]
TypeError: list indices must be integers or slices, not str
It's simpler to work with the CSV file and generate an XML file from that directly.
Try something like this:
import csv
import xml.etree.ElementTree as et

root = et.Element('WorksXML')
tree = et.ElementTree(root)

with open("dataanime.csv", "r", encoding="utf-8") as fin:
    reader = csv.DictReader(fin)
    for row in reader:
        r = et.SubElement(root, "Work")
        # iterate over each of the fields and add to the XML element
        for field in reader.fieldnames:
            et.SubElement(r, field.replace(' ', '_')).text = row[field]

with open("csv_to_xml.xml", 'wb') as fout:
    tree.write(fout, xml_declaration=True, encoding='utf-8')
This generates an XML file with each "work" as a separate sub-element under the root element.
<?xml version="1.0" encoding="utf-8"?>
<WorksXML>
  <Work>
    <Title>Fullmetal Alchemist: Brotherhood</Title>
    <Type>TV</Type>
    <Episodes>64</Episodes>
    <Status>Finished Airing</Status>
    <Start_airing>4/5/2009</Start_airing>
    <End_airing>7/4/2010</End_airing>
    <Starting_season>Spring</Starting_season>
    ...
For the CSV to JSON conversion, the first approach creates a dictionary with titles as keys, while the second creates an array in which each item is an object with all the attributes.
If any of the works have a duplicate title, the first approach will overwrite the duplicate entries. If not, it's just a matter of how you want to access the data in the JSON file: as a dictionary or as a list. If you want to generate XML from the JSON file, the second approach (the array) is the better option.
To convert the array-based JSON file to XML, this will do the job:
import json
import xml.etree.ElementTree as ET

def json_to_xml(jsonFilePath, xmlFilePath):
    root = ET.Element('WorksXML')
    tree = ET.ElementTree(root)
    with open(jsonFilePath, "r", encoding="utf-8") as fin:
        jdata = json.load(fin)
    for obj in jdata:
        r = ET.SubElement(root, "Work")
        for key, value in obj.items():
            ET.SubElement(r, key.replace(' ', '_')).text = value
    with open(xmlFilePath, 'wb') as fout:
        tree.write(fout, xml_declaration=True, encoding='utf-8')

jsonFilePath = 'g_d_anime.json'
xmlFilePath = 'g_d_anime.xml'
json_to_xml(jsonFilePath, xmlFilePath)
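As an aside, the AttributeError in your original script comes from passing the ElementTree (a) to et.tostring(), which expects an Element, not a tree. Either of these would fix that script:

# pass the root Element, not the ElementTree, to tostring()
a_xml_str = et.tostring(r, encoding='unicode')

# or skip tostring() and let the tree serialize itself to the file
a.write("json_to_xml.xml", encoding='utf-8', xml_declaration=True)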
I have a CSV file like this:
#Description
#Param1: value
#Param2: value
...
#ParamN: value

Time (s),Header1,Header2
243.41745,3,1
243.417455,3,5
243.41746,7,6
...
I need to read it with Python, and not using Pandas is a requirement. How can I read the CSV data itself, ignoring the initial lines up to the empty one? I am using the code below, which successfully reads the metadata.
def read(file_path: str):
    '''Read the data of the Digilent WaveForms Logic Analyzer Acquisition
    (model Discovery2).

    Parameter: File path.
    '''
    meta = {}
    RE_CONFIG = re.compile(r'^#(?P<name>[^:]+)(: *(?P<value>.+)\s*$)*')
    with open(file_path, 'r') as fh:
        # Read the metadata and description at the beginning of the file.
        for line in fh.readlines():
            line = line.strip()
            if not line:
                break
            config = RE_CONFIG.match(line)
            if config:
                if not config.group('value'):
                    meta.update({'Description': config.group('name')})
                else:
                    meta.update({config.group('name'): config.group('value')})
        # Read the data itself.
        data = csv.DictReader(fh, delimiter=',')
        return data, meta
This seems to work. I had to change for line in fh.readlines(): to for line in fh: in the portion that reads the metadata, so that readlines() wouldn't consume the whole file before the data is read, then create the DictReader and use it to get the data.
import csv
from pprint import pprint
import re

def read(file_path: str):
    '''Read the data of the Digilent WaveForms Logic Analyzer Acquisition
    (model Discovery2).

    Parameter: File path.
    '''
    meta = {}
    RE_CONFIG = re.compile(r'^#(?P<name>[^:]+)(: *(?P<value>.+)\s*$)*')
    with open(file_path, 'r') as fh:
        # Read the metadata and description at the beginning of the file.
        for line in fh:  # CHANGED: don't slurp the whole file with readlines()
            line = line.strip()
            if not line:
                break
            config = RE_CONFIG.match(line)
            if config:
                if not config.group('value'):
                    meta.update({'Description': config.group('name')})
                else:
                    meta.update({config.group('name'): config.group('value')})
        # Read the data itself.
        reader = csv.DictReader(fh, delimiter=',')
        data = list(reader)
    return data, meta

res = read('mixed.csv')
pprint(res)
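With the sample file from the question, res comes out roughly like this (the values stay strings, since the csv module does no type conversion):

([{'Time (s)': '243.41745', 'Header1': '3', 'Header2': '1'},
  {'Time (s)': '243.417455', 'Header1': '3', 'Header2': '5'},
  {'Time (s)': '243.41746', 'Header1': '7', 'Header2': '6'}],
 {'Description': 'Description', 'Param1': 'value', ..., 'ParamN': 'value'})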
I am trying to take a CSV file coming from S3 and create a list of dictionaries in Python. Code is as follows:
import os
import boto3
import csv
import json
from io import StringIO
import logging
import time
s3 = boto3.resource('s3')
s3Client = boto3.client('s3','us-east-1')
bucket = 'some-bucket'
key = 'some-key'
obj = s3Client.get_object(Bucket = bucket, Key = key)
lines = obj['Body'].read().decode('utf-8').splitlines(True)
newl = []
for line in csv.reader(lines, quotechar='"', delimiter=',', quoting=csv.QUOTE_ALL, skipinitialspace=True, escapechar="\\"):
    newl.append(line)
fieldnames = newl[0]
newl1 = newl[1:]
reader = csv.DictReader(newl1,fieldnames)
out = json.dumps([row for row in reader])
jlist1 = json.loads(out)
but this gives me the error:
iterator should return strings, not list (did you open the file in text mode?)
if I alter the for loop to this:
for line in csv.reader(lines, quotechar='"', delimiter=',', quoting=csv.QUOTE_ALL, skipinitialspace=True, escapechar="\\"):
    newl.append(','.join(line))
then it works; however, some fields have commas in them, so this completely screws up the schema and shifts the data. For example:
|address1 |address2 |state|
------------------------------
|123 Main st|APT 3, Fl1|TX |
becomes:
|address1 |address2 |state|null|
-----------------------------------
|123 Main st|APT 3 |Fl1 |TX |
Where am I going wrong?
The problem is that you are building a list of lists here:
newl.append(line)
and, as the error says: iterator should return strings, not list
so try to cast line to a string:
newl.append(str(line))
Hope this helps :)
I ended up changing the code to this:
obj = s3Client.get_object(Bucket = bucket, Key = key)
lines1 = obj['Body'].read().decode('utf-8').split('\n')
fieldnames = lines1[0].replace('"','').split(',')
testls = [row for row in csv.DictReader(lines1[1:], fieldnames)]
out = json.dumps([row for row in testls])
jlist1 = json.loads(out)
And got the desired result.
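For what it's worth, the original error happened because csv.reader already splits each line into a list, while csv.DictReader expects an iterator of strings. A simpler sketch of the same idea, letting DictReader take the fieldnames from the first row and do all the parsing:

# Hand the decoded lines straight to DictReader: it reads the header row
# itself and correctly handles quoted fields that contain commas.
lines = obj['Body'].read().decode('utf-8').splitlines(True)
reader = csv.DictReader(lines, quotechar='"', delimiter=',',
                        quoting=csv.QUOTE_ALL, skipinitialspace=True,
                        escapechar='\\')
jlist1 = [dict(row) for row in reader]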
I have a csv file where each record is a LinkedIn contact. I have to create another csv file containing only the contacts reached after a specific date (e.g. all the contacts connected to me after 1/04/2017).
So this is my implementation:
def import_from_csv(file):
    key_order = ("FirstName","LastName","EmailAddress","Company","ConnectedOn")
    linkedin_contacts = []
    with open(file, encoding="utf8") as csvfile:
        reader = csv.DictReader(csvfile, delimiter=',')
        for row in reader:
            single_person = {"FirstName": row["FirstName"], "LastName": row["LastName"],
                             "EmailAddress": row["EmailAddress"], "Company": row["Company"],
                             "ConnectedOn": parser.parse(row["ConnectedOn"])}
            od = OrderedDict((k, single_person[k]) for k in key_order)
            linkedin_contacts.append(od)
    return linkedin_contacts
The first script gives me a list of ordered dicts. I don't know if the way I used to achieve the correct order is good; also, looking at some examples (like here), I'm not using the od.update method, but I don't think I need it. Is that correct?
Now I wrote a second function to filter the list:
def filter_by_date(connections):
    filtered_list = []
    target_date = parser.parse("01/04/2017")
    for row in connections:
        if row["ConnectedOn"] > target_date:
            filtered_list.append(row)
    return filtered_list
Am I doing this correctly?
Is there a way to optimize the code? Thanks
First point: you don't need the OrderedDict at all; just use a csv.DictWriter to write the filtered csv.

fieldnames = ("FirstName","LastName","EmailAddress","Company","ConnectedOn")

with open("/path/to/final.csv", "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames)
    writer.writeheader()
    writer.writerows(filtered_contacts)
Second point: you don't need to create a new dict from the one yielded by the csv reader; just update the ConnectedOn key in place:
def import_from_csv(file):
    linkedin_contacts = []
    with open(file, encoding="utf8") as csvfile:
        reader = csv.DictReader(csvfile, delimiter=',')
        for row in reader:
            row["ConnectedOn"] = parser.parse(row["ConnectedOn"])
            linkedin_contacts.append(row)
    return linkedin_contacts
And finally, if all you have to do is take the source csv, filter records on ConnectedOn, and write the result, you don't need to load the whole source into memory, build a filtered list (in memory again), and write that list out; you can stream the whole operation:

def filter_csv(source_path, dest_path, date):
    fieldnames = ("FirstName","LastName","EmailAddress","Company","ConnectedOn")
    target = parser.parse(date)
    with open(source_path, newline="") as source, open(dest_path, "w", newline="") as dest:
        reader = csv.DictReader(source)
        writer = csv.DictWriter(dest, fieldnames)
        # if you want a header line with the fieldnames - else comment it out
        writer.writeheader()
        for row in reader:
            row_date = parser.parse(row["ConnectedOn"])
            if row_date > target:
                writer.writerow(row)
And here you are, plain and simple.
NB: I don't know what parser.parse() is, but as other answers mention, you'd probably be better off using the datetime module instead.
For filtering you could use the filter() function:

def filter_by_date(connections):
    # the format string must match the date text: day/month/year here
    target_date = datetime.strptime("01/04/2017", '%d/%m/%Y').date()
    return list(filter(lambda x: x["ConnectedOn"] > target_date, connections))
And instead of creating a plain dict and then copying its values into an OrderedDict, you could write the values directly to the OrderedDict:

for row in reader:
    od = OrderedDict()
    od["FirstName"] = row["FirstName"]
    od["LastName"] = row["LastName"]
    od["EmailAddress"] = row["EmailAddress"]
    od["Company"] = row["Company"]
    od["ConnectedOn"] = datetime.strptime(row["ConnectedOn"], '%d/%m/%Y').date()
    linkedin_contacts.append(od)
If you know the date format you don't need python_dateutil; you can use the built-in datetime.datetime.strptime() with the needed format.
That's because you didn't specify the format string.
Use:
from datetime import datetime

format = '%d/%m/%Y'
date_text = '01/04/2017'
# the inverse operation is datetime.strftime(format)
datetime.strptime(date_text, format)

# ....
# with format as a global
for row in reader:
    od = OrderedDict()
    od["FirstName"] = row["FirstName"]
    od["LastName"] = row["LastName"]
    od["EmailAddress"] = row["EmailAddress"]
    od["Company"] = row["Company"]
    od["ConnectedOn"] = datetime.strptime(row["ConnectedOn"], format)
    linkedin_contacts.append(od)
Do:
def filter_by_date(connections, date_text):
    target_date = datetime.strptime(date_text, format)
    return [x for x in connections if x["ConnectedOn"] > target_date]
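Usage, with the format global defined as above:

# keep only the contacts connected strictly after 1 April 2017
recent = filter_by_date(linkedin_contacts, '01/04/2017')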