How to read CSV after metadata? - python

I have a CSV file like this:
#Description
#Param1: value
#Param2: value
...
#ParamN: value
Time (s),Header1,Header2
243.41745,3,1
243.417455,3,5
243.41746,7,6
...
I need to read it with Python; not using Pandas is a requirement. How can I read the CSV data itself, ignoring the initial lines up to the empty one? I am using the code below, which successfully reads the metadata.
import csv
import re

def read(file_path: str):
    '''Read the data of the Digilent WaveForms Logic Analyzer acquisition
    (model Discovery2).
    Parameter: File path.
    '''
    meta = {}
    RE_CONFIG = re.compile(r'^#(?P<name>[^:]+)(: *(?P<value>.+)\s*$)*')
    with open(file_path, 'r') as fh:
        # Read the metadata and description at the beginning of the file.
        for line in fh.readlines():
            line = line.strip()
            if not line:
                break
            config = RE_CONFIG.match(line)
            if config:
                if not config.group('value'):
                    meta.update({'Description': config.group('name')})
                else:
                    meta.update({config.group('name'): config.group('value')})
        # Read the data itself.
        data = csv.DictReader(fh, delimiter=',')
        return data, meta

This seems to work. In the portion that reads the metadata I had to change for line in fh.readlines(): to for line in fh: so that the data lines wouldn't be read ahead as well, then create the DictReader on the same file handle and materialize it with list() to get the data.
import csv
from pprint import pprint
import re

def read(file_path: str):
    '''Read the data of the Digilent WaveForms Logic Analyzer acquisition
    (model Discovery2).
    Parameter: File path.
    '''
    meta = {}
    RE_CONFIG = re.compile(r'^#(?P<name>[^:]+)(: *(?P<value>.+)\s*$)*')
    with open(file_path, 'r') as fh:
        # Read the metadata and description at the beginning of the file.
        for line in fh:  # CHANGED
            line = line.strip()
            if not line:
                break
            config = RE_CONFIG.match(line)
            if config:
                if not config.group('value'):
                    meta.update({'Description': config.group('name')})
                else:
                    meta.update({config.group('name'): config.group('value')})
        # Read the data itself.
        reader = csv.DictReader(fh, delimiter=',')
        data = list(reader)
    return data, meta

res = read('mixed.csv')
pprint(res)
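For reference, here is a minimal alternative sketch of the same idea (my own naming; the metadata parsing is left out) that uses itertools.takewhile to consume everything up to and including the blank line, leaving the handle positioned at the CSV header:

import csv
import itertools

def read_data_only(file_path: str):
    '''Skip the leading #-metadata block and read only the CSV data.'''
    with open(file_path, 'r', newline='') as fh:
        # takewhile consumes lines while they are non-blank, including the
        # blank separator itself, so fh is then at the CSV header row.
        for _ in itertools.takewhile(lambda line: line.strip(), fh):
            pass
        return list(csv.DictReader(fh, delimiter=','))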

Related

Updating Json Object using data from a CSV File

I have a JSON file (>1 GB) and another CSV file with one matching column (i.e. ID). I need to update the JSON file by mapping the CSV onto the JSON.
The approach I thought of at first was to convert the JSON to CSV and then overwrite the CSV, but since the file is huge, that is not the most efficient way. I am supposed to use Python.
import csv
import json

id = []
qrank = []

def readingCsvFile():
    with open('qrank.csv', 'r') as csvFile:
        dataCsv = csv.reader(csvFile)
        for row in dataCsv:
            id.append(row[0])
            qrank.append(row[1])

dataJson = [json.loads(line) for line in open('enhanced-wikipois', 'r', encoding='UTF-8')]
records = len(dataJson)
readingCsvFile()

for i in range(records):
    x = dataJson[i]['id']
    if (x in id):
        pos = id.index(x)
        dataJson[i]['wikiQRank'] = qrank[pos]

print(dataJson)
The size of the file is not really relevant. What's important is the number of JSON objects and the number of "qrank" values.
If you build a dictionary based on id and rank from the CSV file then the subsequent lookups will be much more efficient.
There are a number of other efficiencies that you could implement.
import csv
import json

CSVFILE = '/Volumes/G-Drive/qrank.csv'
JSONLFILE = '/Volumes/G-Drive/enhanced-wikipois'

def read_csv(filename):
    with open(filename, newline='') as data:
        reader = csv.reader(data)
        return {_id: rank for _id, rank, *_ in reader}

def read_jsonl(filename):
    with open(filename) as data:
        return [json.loads(line) for line in data]

id_dict = read_csv(CSVFILE)
json_data = read_jsonl(JSONLFILE)

for j in json_data:
    if (_id := j.get('id')) is not None:
        if (rank := id_dict.get(_id)) is not None:
            j['wikiQRank'] = rank

print(json_data)
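Since the JSONL file is over 1 GB, you may also want to avoid holding every record in memory at once. A sketch of a streaming variant (same dictionary lookup as above; the output path is hypothetical):

import csv
import json

def read_csv(filename):
    with open(filename, newline='') as data:
        return {_id: rank for _id, rank, *_ in csv.reader(data)}

id_dict = read_csv('qrank.csv')

# Stream the JSONL input record by record and write each (possibly updated)
# record straight out, so memory use stays flat regardless of file size.
with open('enhanced-wikipois') as src, open('enhanced-wikipois.updated', 'w') as dst:
    for line in src:
        record = json.loads(line)
        if (rank := id_dict.get(record.get('id'))) is not None:
            record['wikiQRank'] = rank
        dst.write(json.dumps(record) + '\n')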

How can I edit my code to print out the content of my created json file?

My program takes a CSV file as input and writes it out as a JSON-format file. On the final line, I use print to output the contents of the JSON file to the screen. However, it does not print the JSON file's contents and I don't understand why.
Here is my code that I have so far:
import csv
import json

def jsonformat(infile, outfile):
    contents = {}
    csvfile = open(infile, 'r')
    reader = csvfile.read()
    for m in reader:
        key = m['No']
        contents[key] = m
    jsonfile = open(outfile, 'w')
    jsonfile.write(json.dumps(contents))
    csvfile.close()
    jsonfile.close()
    return jsonfile

infile = 'orders.csv'
outfile = 'orders.json'
output = jsonformat(infile, outfile)
print(output)
Your function returns the jsonfile variable, which is a file.
Try adding this:
jsonfile.close()
with open(outfile, 'r') as file:
    return file.read()
Your function returns a file handle to the file jsonfile that you then print. Instead, return the contents that you wrote to that file. Since you opened the file in w mode, any previous contents are removed before writing the new contents, so the contents of your file are going to be whatever you just wrote to it.
In your function, do:
def jsonformat(infile, outfile):
    ...
    # Instead of this:
    # jsonfile.write(json.dumps(contents))
    # do this:
    json_contents = json.dumps(contents, indent=4)  # indent=4 to pretty-print
    jsonfile.write(json_contents)
    ...
    return json_contents
Aside from that, you aren't reading the CSV file the correct way. If your file has a header, you can use csv.DictReader to read each row as a dictionary. Then, you'll be able to use for m in reader: key = m['No']. Change reader = csvfile.read() to reader = csv.DictReader(csvfile)
As of now, reader is a string that contains all the contents of your file. for m in reader makes m each character in this string, and you cannot access the "No" key on a character.
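Putting both fixes together, a minimal sketch of the corrected function (assuming orders.csv has a header row containing a No column, as the original code implies):

import csv
import json

def jsonformat(infile, outfile):
    contents = {}
    with open(infile, newline='') as csvfile:
        reader = csv.DictReader(csvfile)  # each row becomes a dict keyed by the header
        for m in reader:
            contents[m['No']] = m
    json_contents = json.dumps(contents, indent=4)
    with open(outfile, 'w') as jsonfile:
        jsonfile.write(json_contents)
    return json_contents

print(jsonformat('orders.csv', 'orders.json'))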
import json

a_file = open("sample.json", "r")
a_json = json.load(a_file)
pretty_json = json.dumps(a_json, indent=4)
a_file.close()
print(pretty_json)
Use this sample to print the contents of your JSON file. Have a good day.

Stream Bytes chunks to csv rows in python

I need to process a large remote CSV line by line without downloading it entirely.
Below is the closest I got.
I iterate byte chunks from Azure, and have some code to handle truncated lines.
But this cannot work if CSV values contain a newline, as I am unable to distinguish between newlines inside values and newlines that terminate CSV records.
# this does not work
def azure_iter_lines(logger_scope, client, file_path):
    # get a StorageStreamDownloader
    # https://learn.microsoft.com/en-us/python/api/azure-storage-file-datalake/azure.storage.filedatalake.storagestreamdownloader?view=azure-python
    file_client = client.get_file_client(file_path)
    file_handle = file_client.download_file()
    truncated_line = ''
    for chunk in file_handle.chunks():
        # have the previous truncated line appended to the next block
        chunk_txt = truncated_line + chunk.decode("utf-8")
        lines = chunk_txt.split('\n')  # THIS CANNOT WORK AS VALUES CONTAIN NEWLINES
        for line in lines[0:len(lines) - 2]:
            yield line
        truncated_line = lines[len(lines) - 1]
    # process the last chunk (same code)
    chunk_txt = truncated_line
    lines = chunk_txt.split('\n')  # THIS CANNOT WORK AS VALUES CONTAIN NEWLINES
    for line in lines[0:len(lines) - 2]:
        yield line
    truncated_line = lines[len(lines) - 1]
Ideally I would use csv.DictReader(), but I was not able to do so, as it downloads the file entirely.
# this does not work
def azure_iter_lines(logger_scope, client, file_path):
    file_client = client.get_file_client(file_path)
    file_handle = file_client.download_file()
    buffer = io.BytesIO()
    file_handle.readinto(buffer)  # THIS DOWNLOADS THE FILE ENTIRELY
    csvreader = csv.DictReader(buffer, delimiter=";")
    return csvreader
Here is an update using some hints from @H.Leger. Please note that this still does not work:
file_client = client.get_file_client(file_path)
file_handle = file_client.download_file()
stream = codecs.iterdecode(file_handle.chunks(), 'utf-8')
csvreader = csv.DictReader(stream, delimiter=";")

for row in csvreader:
    print(row)
# => _csv.Error: new-line character seen in unquoted field - do you need to open the file in universal-newline mode?
EDIT: Final solution based on @paiv's answer.
EDIT: Updated solution to use io instead of codecs for faster parsing.
import io
import csv
import ctypes as ct

# bytes chunk iterator to python stream adapter
# https://stackoverflow.com/a/67547597/2523414
class ChunksAdapter:
    def __init__(self, chunks):
        self.chunks = chunks
        self.buf = b''
        self.closed = False

    def readable(self):
        return True

    def writable(self):
        return False

    def seekable(self):
        return False

    def close(self):
        self.closed = True

    def read(self, size):
        if not self.buf:
            self.buf = next(self.chunks, b'')
        res, self.buf = self.buf[:size], self.buf[size:]
        return res

# get the downloader object
file_client = client.get_file_client(file_path)
downloader = file_client.download_file()

# adapt the downloader iterator to a byte stream
file_object = ChunksAdapter(downloader.chunks())

# decode bytes stream to utf-8
text_stream = io.TextIOWrapper(file_object, encoding='utf-8', newline='')

# update csv field limit to handle large fields
# https://stackoverflow.com/a/54517228/2523414
csv.field_size_limit(int(ct.c_ulong(-1).value // 2))

csvreader = csv.DictReader(text_stream, delimiter=";", quotechar='"', quoting=csv.QUOTE_MINIMAL)
for row in csvreader:
    print(row)
Disclaimer: I know little about Azure specifics. Ultimately, you would want to stream separate chunks too.
In Python, given a file object, you can set up CSV streaming this way:
import codecs
import csv

codec = codecs.getreader('utf-8')
text_stream = codec(file_object)
csvreader = csv.DictReader(text_stream)
Now you can iterate over csvreader, and it will read from file_object in a streaming fashion.
Edit: as @Martijn Pieters suggested, we can gain performance with TextIOWrapper instead of codecs:
text_stream = io.TextIOWrapper(file_object, encoding='utf-8', newline='')
Check the note in the csv module docs on the newline parameter.
But Azure's StorageStreamDownloader does not provide Python's file object interface. It has a .chunks() generator (which I assume will issue a separate HTTP request to retrieve each chunk).
You can adapt .chunks() into a file object with a simple adapter:
class ChunksAdapter:
    def __init__(self, chunks):
        self.chunks = chunks
        self.buf = b''

    def read(self, size):
        if not self.buf:
            self.buf = next(self.chunks, b'')
        res, self.buf = self.buf[:size], self.buf[size:]
        return res
And use it like this:
downloader = file_client.download_file()
file_object = ChunksAdapter(downloader.chunks())
Be sure to configure DictReader for the appropriate CSV dialect.
And set appropriate values for max_single_get_size, max_chunk_get_size on the blob client.
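For illustration, a sketch of that client configuration (assuming the DataLake clients forward max_single_get_size and max_chunk_get_size to the underlying blob transport; the account URL, credential, and file system name are placeholders):

from azure.storage.filedatalake import DataLakeServiceClient

# Hypothetical account/credential. The two sizes (in bytes) control how much
# data the first GET and each subsequent range GET request, which in turn
# bounds the size of the chunks yielded by .chunks().
service = DataLakeServiceClient(
    account_url='https://myaccount.dfs.core.windows.net',
    credential='my-credential',
    max_single_get_size=4 * 1024 * 1024,  # first download request: 4 MiB
    max_chunk_get_size=4 * 1024 * 1024,   # each following chunk: 4 MiB
)
client = service.get_file_system_client('my-filesystem')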
I believe the requests package can be useful for you. Using the stream option while getting your file, and the Response.iter_lines() function, should do what you need:
import codecs
import csv
import requests

url = "https://navitia.opendatasoft.com//explore/dataset/all-datasets/download?format=csv"
r = requests.get(url, stream=True)  # using the stream option to avoid loading everything

try:
    buffer = r.iter_lines()  # iter_lines() will feed you the remote file line by line
    reader = csv.DictReader(codecs.iterdecode(buffer, 'utf-8'), delimiter=';')
    for row in reader:
        print(row)  # Do stuff here
finally:
    r.close()
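Note that iter_lines() splits on newlines before the csv module sees the text, so strictly speaking it shares the quoted-newline limitation from the question. A sketch of a variant that leaves newline handling to the csv module by wrapping the raw response stream instead (same URL; assuming the server sends UTF-8):

import csv
import io
import requests

url = "https://navitia.opendatasoft.com//explore/dataset/all-datasets/download?format=csv"

with requests.get(url, stream=True) as r:
    r.raw.decode_content = True  # transparently decompress gzip/deflate responses
    text_stream = io.TextIOWrapper(r.raw, encoding='utf-8', newline='')
    for row in csv.DictReader(text_stream, delimiter=';'):
        print(row)  # Do stuff here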

How to make json file a list in Python

As said, I'd like to open a JSON file and turn it into a list, in order to append new elements to it and then dump everything back into the JSON file.
Here is my code (the commented part is what I previously tried):
import json

class Carta:
    def __init__(self, filename):
        self.__filename = filename
        self.__lista = []
        # try:
        #     f = open(self.__filename, "r")
        # except:
        #     f = open(self.__filename, "w")
        #     f.close()
        #     f = open(self.__filename, "r")
        with open(self.__filename) as file:
            self.__lista = json.load(file)
        # read = json.load(f)
        # for c in leggi:
        #     self.__lista.append(c)
        # print(self.__lista)
        # f.close()

    def add(self, c):
        self.__lista.append(c)

    def save(self):
        f = open(self.__filename, "w")
        for c in self.__lista:
            f.write("%s\n" % str(c))
        f.close()
It won't work if you read a JSON list from a JSON file and then write back a custom string, because the next time you read the file the JSON parse will fail.
So, during write/save you should write JSON itself. Here's code that shows how to do it.
import json

class Carta:
    def __init__(self, filename):
        self.__filename = filename
        self.__lista = list()
        self.read_from_json_file()

    def read_from_json_file(self):
        with open(self.__filename) as file:
            self.__lista = json.load(file)

    def write_to_json_file(self):
        with open(self.__filename, 'w') as f:
            json.dump(self.__lista, f)

    def add(self, value):
        self.__lista.append(value)
The reason you should use with open(filename, mode) as f: instead of f = open(filename) is that at the end of the with block the file is automatically closed. Otherwise you have to call f.close() every time you open a file.
json.load - reads JSON data from a file and converts it to a Python data type/structure.
json.dump - takes a Python data type/structure, converts it into a string, stores it in the file (file handle), and saves the file.
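A quick usage sketch (carte.json is a hypothetical file that already contains a JSON list):

carta = Carta('carte.json')              # loads the existing list from the file
carta.add({'value': 7, 'suit': 'cups'})  # append a new element
carta.write_to_json_file()               # dump the whole list back as valid JSON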
Using pdb to trace errors
import json
import pdb

class Carta:
    def __init__(self, filename):
        self.__filename = filename
        self.__lista = list()
        self.read_from_json_file()

    def read_from_json_file(self):
        pdb.set_trace()  # pause execution and start the debugger
        # When paused:
        #   type n to step to the next line,
        #   type c to continue execution until the next breakpoint,
        #   type b <file_name>:<line_number> to add another breakpoint,
        #     where <file_name> and <line_number> are placeholders.
        #     Example: b /home/username/hello.py:43 adds a breakpoint
        #     at line 43 of hello.py in /home/username.
        #   type q to quit the debugger and halt execution.
        with open(self.__filename) as file:
            self.__lista = json.load(file)

    def write_to_json_file(self):
        with open(self.__filename, 'w') as f:
            json.dump(self.__lista, f)

    def add(self, value):
        # Second breakpoint
        pdb.set_trace()
        self.__lista.append(value)
Or just run your file with python -m pdb file.py and then add breakpoints. It will pause at the first line itself and give you a (pdb) console where you can add breakpoints.
import json

# read from file
with open("demofile.txt", "r") as f:
    x = f.read()

# parse
y = json.loads(x)

# edit
y["user"] = {"fname": "John", "lname": "Who"}

# save to file
with open("demofile.txt", "w") as f:
    f.write(json.dumps(y))
https://repl.it/#KrzysztofPecyna/PythonJsonExample
To read JSON from a file:
import json

with open('data.txt') as json_file:
    data = json.load(json_file)
To add new data:
data['key'] = "value"
To write JSON to a file:
with open('data.txt', 'w') as outfile:
    json.dump(data, outfile)

Save file without first and last double quotes

I am trying to save my data to a file. My problem is that the saved file contains double quotes at the beginning and the end of each line. I have tried many ways to solve it, from str.replace(), strip, and csv to json and pickle, but the problem persists and I am stuck. Please help me. I will detail my problem below.
Firstly, I have a file called angles.txt like that:
{'left_w0': -2.6978887076110842, 'left_w1': -1.3257428944152834, 'left_w2': -1.7533400385498048, 'left_e0': 0.03566505327758789, 'left_e1': 0.6948932961181641, 'left_s0': -1.1665923878540039, 'left_s1': -0.6726505747192383}
{'left_w0': -2.6967382220214846, 'left_w1': -0.8440729275695802, 'left_w2': -1.7541070289428713, 'left_e0': 0.036048548474121096, 'left_e1': 0.16682041049194338, 'left_s0': -0.7731263162109375, 'left_s1': -0.7056311616210938}
I read the text file line by line and transfer each line into a dict variable called data. Here is the file-reading code:
def read_data_from_file(file_name):
    data = dict()
    f = open(file_name, 'r')
    for index_line in range(1, number_lines + 1):
        data[index_line] = eval(f.readline())
    f.close()
    return data
Then I changed something in the data. Something like data[index_line]['left_w0'] = data[index_line]['left_w0'] + 0.0006. After that I wrote my data into another text file. Here is the code:
def write_data_to_file(data, file_name):
    f = open(file_name, 'wb')
    data_convert = dict()
    for index_line in range(1, number_lines):
        data_convert[index_line] = repr(data[index_line])
        data_convert[index_line] = data_convert[index_line].replace('"', '')  # I also used strip
        json.dump(data_convert[index_line], f)
        f.write('\n')
    f.close()
The result I received in the new file is:
"{'left_w0': -2.6978887076110842, 'left_w1': -1.3257428944152834, 'left_w2': -1.7533400385498048, 'left_e0': 0.03566505327758789, 'left_e1': 0.6948932961 181641, 'left_s0': -1.1665923878540039, 'left_s1': -0.6726505747192383}"
"{'left_w0': -2.6967382220214846, 'left_w1': -0.8440729275695802, 'left_w2': -1.7541070289428713, 'left_e0': 0.036048548474121096, 'left_e1': 0.166820410 49194338, 'left_s0': -0.7731263162109375, 'left_s1': -0.7056311616210938}"
I cannot remove "".
The quotes appear because you json.dump a string (the repr of each dict), and JSON encodes a string by wrapping it in double quotes. Dump the data structure itself instead. You could simplify your code by removing the unnecessary transformations:
import json

def write_data_to_file(data, filename):
    with open(filename, 'w') as file:
        json.dump(data, file)

def read_data_from_file(filename):
    with open(filename) as file:
        return json.load(file)
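A quick round-trip sketch with these functions (angles.json is a hypothetical path; note that JSON object keys are always strings, so the integer line numbers come back as '1', '2', ...):

data = {1: {'left_w0': -2.6978887076110842}, 2: {'left_w0': -2.6967382220214846}}
write_data_to_file(data, 'angles.json')

data = read_data_from_file('angles.json')
data['1']['left_w0'] += 0.0006  # keys are strings after the round trip
write_data_to_file(data, 'angles.json')  # no stray quotes: the dict is dumped directly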
