I have a lot of JSON files in my folder that I want to convert to CSV format.
Should I use import glob? I am a novice, so how can I modify my code?
# -*- coding: utf-8 -*-
import csv
import json
import sys
import codecs

def trans(path):
    jsonData = codecs.open('C:/Users/jeri/Desktop/1', '*.json', 'r', 'utf-8')
    # csvfile = open(path+'.csv', 'w')
    # csvfile = open(path+'.csv', 'wb')
    csvfile = open('C:/Users/jeri/Desktop/1.csv', 'w', encoding='utf-8', newline='')
    writer = csv.writer(csvfile, delimiter=',')
    flag = True
    for line in jsonData:
        dic = json.loads(line)
        if flag:
            keys = list(dic.keys())
            print(keys)
            flag = False
        writer.writerow(list(dic.values()))
    jsonData.close()
    csvfile.close()

if __name__ == '__main__':
    path = str(sys.argv[0])
    print(path)
    trans(path)
Yes, using glob would be a good way to iterate through the .json files in your folder! But glob doesn't have anything to do with the reading or writing of files. After importing glob, you can use it like this:
for curr_file in glob.glob("*.json"):
    # Process each file here
I see that you've used the json module to do the reading in your code snippet. I'd say the better way to go about it is to use pandas (import pandas as pd):
df = pd.read_json(curr_file)
I say this because with the pandas library you can then convert from .json to .csv with a single call:
df.to_csv('file_name.csv')
Combining the three together, it would look like this:
for curr_file in glob.glob("*.json"):
    # Process each file here, giving each output file its own name
    df = pd.read_json(curr_file)
    df.to_csv(curr_file.replace('.json', '.csv'))
Note the per-file output name: writing every dataframe to the same 'file_name.csv' would overwrite it on each pass of the loop.
Also, note that if your JSON has nested objects, it can't be directly converted to CSV; you'll have to settle on an organization of the data prior to the conversion.
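If you do hit nested data, pandas' json_normalize can flatten it first. Here is a minimal sketch with a made-up record, purely for illustration:
import pandas as pd

# hypothetical nested record, just to show the flattening
record = {"name": "alice", "orders": [{"id": 1, "total": 9.5}, {"id": 2, "total": 3.0}]}

# one row per entry in "orders", with "name" repeated on each row
df = pd.json_normalize(record, record_path="orders", meta=["name"])
df.to_csv("flattened.csv", index=False)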
I am trying to read a gzip file using pandas.read_csv like so:
import pandas as pd
df = pd.read_csv("data.ZIP.gz", usecols=[*range(0, 39)], encoding="latin1", skipinitialspace=True)
But it throws this error:
ValueError: Passed header names mismatches usecols
However, if I manually extract the zip file from the gz file, then read_csv is able to read the data without errors:
df = pd.read_csv("data.ZIP", usecols=[*range(0, 39)], encoding="latin1", skipinitialspace=True)
Since I have to read a lot of these files, I don't want to extract them manually. So how can I fix this error?
You have two levels of compression - gzip and zip - but pandas knows how to work with only one level of compression.
You can use the gzip and zipfile modules with io.BytesIO to extract the inner file to a file-like object in memory.
Here is minimal working code.
It can be useful if the zip archive has many files and you want to select which one to extract:
import pandas as pd
import gzip
import zipfile
import io

with gzip.open('data.csv.zip.gz') as f1:
    data = f1.read()

file_like_object_1 = io.BytesIO(data)

with zipfile.ZipFile(file_like_object_1) as f2:
    #print([x.filename for x in f2.filelist])  # list all filenames
    #data = f2.read('data.csv')                # extract selected filename
    #data = f2.read(f2.filelist[0])            # extract first file
    data = f2.read(f2.filelist[0].filename)    # extract first file

file_like_object_2 = io.BytesIO(data)

df = pd.read_csv(file_like_object_2)

print(df)
But if the zip archive has only one file, then you can use read_csv to extract it directly - you just need to add the option compression='zip', because the file-like object has no filename, so read_csv can't use the filename's extension to recognize the compressed format.
import pandas as pd
import gzip
import io

with gzip.open('data.csv.zip.gz') as f1:
    data = f1.read()

file_like_object_1 = io.BytesIO(data)

df = pd.read_csv(file_like_object_1, compression='zip')

print(df)
Use the gzip module to decompress all your files, something like this:
import glob
import gzip

# all gzip-compressed files in the current folder
list_file_names = glob.glob("*.gz")

for file in list_file_names:
    file_name = file.replace(".gz", "")
    # read the decompressed content
    with gzip.open(file, 'rb') as f:
        file_content = f.read()
    # write it back out without the .gz extension
    with open(file_name, "wb") as r:
        r.write(file_content)
You can use the zipfile module, such as:
import zipfile

with zipfile.ZipFile(path_to_zip_file, 'r') as zip_ref:
    zip_ref.extractall(directory_to_extract_to)
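For the doubly-compressed file from the question, a quick way to combine the two steps on disk (a sketch, reusing the filenames from the question) is:
import gzip
import shutil
import zipfile

# strip the outer gzip layer first: data.ZIP.gz -> data.ZIP
with gzip.open("data.ZIP.gz", "rb") as f_in, open("data.ZIP", "wb") as f_out:
    shutil.copyfileobj(f_in, f_out)

# then extract the inner zip archive
with zipfile.ZipFile("data.ZIP", "r") as zip_ref:
    zip_ref.extractall("extracted")  # hypothetical destination folder
After that, pd.read_csv can read the extracted file directly, as in the question.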
I'm new to Python, and the task I am performing is to extract specific key values from a list of .iris files (each containing data in a nested dictionary format) in a specific directory.
I want to extract the specific values and save them as a new .csv file, repeating this for all the other files.
Below is a sample of a .iris file, from which I should extract only these keys ('uid', 'enabled', 'login', 'name'):
{"streamType":"user",
"uid":17182,
"enabled":true,
"login":"xyz",
"name":"abcdef",
"comment":"",
"authSms":"",
"email":"",
"phone":"",
"location":"",
"extraLdapOu":"",
"mand":997,
"global":{
"userAccount":"View",
"uid":"",
"retention":"No",
"enabled":"",
"messages":"Change"},
"grants":[{"mand":997,"role":1051,"passOnToSubMand":true}],
I am trying to convert the .iris files to .json and read the files one by one, but unfortunately I am not getting the exact output desired.
Please, could anyone help me?
My code (added from comments):
import os
import csv

path = ''
os.chdir(path)

# Read iris file
def read_iris_file(file_path):
    with open(file_path, 'r') as f:
        print(f.read())

# Iterate through all files
for file in os.listdir():
    # Check whether file is in iris format or not
    if file.endswith(".iris"):
        file_path = f"{path}\{file}"
        # Call read iris file function
        print(read_iris_file(file_path))
Your files contain data in JSON format, so we can use the built-in json module to parse it. To iterate over files with a certain extension you can use Path.glob() with the pattern "*.iris". Then we can use csv.DictWriter() and pass "ignore" to the extrasaction argument, which makes DictWriter ignore the keys we don't need and write only those passed to the fieldnames argument.
Code:
import csv
import json
from pathlib import Path

path = Path(r"path/to/folder")
keys = "uid", "enabled", "login", "name"

with open(path / "result.csv", "w", newline="") as out_f:
    writer = csv.DictWriter(out_f, fieldnames=keys, extrasaction='ignore')
    writer.writeheader()
    for file in path.glob("*.iris"):
        with open(file) as inp_f:
            data = json.load(inp_f)
        writer.writerow(data)
Try the below (the key point here is loading the .iris file using ast.literal_eval; note that it expects Python literals, so if your files use JSON-style true/false you would need json.loads instead):
import ast

fields = ('uid', 'enabled', 'login', 'name')

with open('my.iris') as f1:
    data = ast.literal_eval(f1.read())

with open('my.csv', 'w') as f2:
    f2.write(','.join(fields) + '\n')
    f2.write(','.join(str(data[f]) for f in fields) + '\n')
my.csv
uid,enabled,login,name
17182,true,xyz,abcdef
I'm new to Python and I've got a large JSON file that I need to convert to CSV - below is a sample:
{ "status": "success","Name": "Theresa May","Location": "87654321","AccountCategory": "Business","AccountType": "Current","TicketNo": "12345-12","AvailableBal": "12775.0400","BookBa": "123475.0400","TotalCredit": "1234567","TotalDebit": "0","Usage": "5","Period": "May 11 2014 to Jul 11 2014","Currency": "GBP","Applicants": "Angel","Signatories": [{"Name": "Not Available","BVB":"Not Available"}],"Details": [{"PTransactionDate":"24-Jul-14","PValueDate":"24-Jul-13","PNarration":"Cash Deposit","PCredit":"0.0000","PDebit":"40003.0000","PBalance":"40003.0000"},{"PTransactionDate":"24-Jul-14","PValueDate":"23-Jul-14","PTest":"Cash Deposit","PCredit":"0.0000","PDebit":"40003.0000","PBalance":"40003.0000"},{"PTransactionDate":"25-Jul-14","PValueDate":"22-Jul-14","PTest":"Cash Deposit","PCredit":"0.0000","PDebit":"40003.0000","PBalance":"40003.0000"},{"PTransactionDate":"25-Jul-14","PValueDate":"21-Jul-14","PTest":"Cash Deposit","PCredit":"0.0000","PDebit":"40003.0000","PBalance":"40003.0000"},{"PTransactionDate":"25-Jul-14","PValueDate":"20-Jul-14","PTest":"Cash Deposit","PCredit":"0.0000","PDebit":"40003.0000","PBalance":"40003.0000"}]}
I need this to show up with
name, status, location, accountcategory, accounttype, availablebal, totalcredit, totaldebit, etc. as columns,
and with pcredit, pdebit, pbalance, ptransactiondate, pvaluedate and ptest taking new values on each row, as the JSON file shows.
I've managed to put the scripts below together from looking online, but they're giving me an empty CSV file at the end. What have I done wrong? I have used online JSON-to-CSV converters and they work, but as these are sensitive files I'm hoping to write/manage my own script so I can see exactly how it works. Please see below for my Python scripts - can I have some advice on what to change? Thanks.
import csv
import json

infile = open("BankStatementJSON1.json", "r")
outfile = open("testing.csv", "w")

writer = csv.writer(outfile)

for row in json.loads(infile.read()):
    writer.writerow(row)
import csv, json, sys

# if you are not using utf-8 files, remove the next line
sys.setdefaultencoding("UTF-8")  # set the encoding to utf8

# check if you pass the input file and output file
if sys.argv[1] is not None and sys.argv[2] is not None:
    fileInput = sys.argv[1]
    fileOutput = sys.argv[2]
    inputFile = open("BankStatementJSON1.json", "r")  # open json file
    outputFile = open("testing2.csv", "w")            # load csv file
    data = json.load("BankStatementJSON1.json")       # load json content
    inputFile.close()                                 # close the input file
    output = csv.writer("testing.csv")                # create a csv.writer
    output.writerow(data[0].keys())                   # header row
    for row in data:
        output.writerow(row.values())                 # values row
This works for the JSON example you posted. The issue is that you have nested dicts, and you can't create sub-headers and sub-rows for pcredit, pdebit, pbalance, ptransactiondate, pvaluedate and ptest the way you want.
You can use csv.DictWriter:
import csv
import json

with open("BankStatementJSON1.json", "r") as inputFile:  # open json file
    data = json.loads(inputFile.read())  # load json content

with open("testing.csv", "w") as outputFile:  # open csv file
    output = csv.DictWriter(outputFile, data.keys())  # create a writer
    output.writeheader()
    output.writerow(data)
The with blocks also make sure the output file is closed properly at the end.
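If you do want one CSV row per entry in Details, a minimal sketch (using the field names from the sample JSON above, and repeating the top-level values on every row) could look like this:
import csv
import json

with open("BankStatementJSON1.json") as f:
    data = json.load(f)

details = data.pop("Details")  # list of per-transaction dicts
data.pop("Signatories", None)  # drop the other nested list
detail_keys = ["PTransactionDate", "PValueDate", "PNarration", "PTest",
               "PCredit", "PDebit", "PBalance"]

with open("testing.csv", "w", newline="") as out_f:
    writer = csv.writer(out_f)
    writer.writerow(list(data.keys()) + detail_keys)  # header row
    for d in details:
        # top-level values repeat on each row; missing detail keys become ""
        writer.writerow(list(data.values()) + [d.get(k, "") for k in detail_keys])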
I have a question about how to continue with my code: I need to take all the files from a folder and merge them into one file with another text format.
Example:
The Input files are of text format like this:
"{'nr': '3173391045', 'data': '27/12/2017'}"
"{'nr': '2173391295', 'data': '05/01/2017'}"
"{'nr': '5173351035', 'data': '07/03/2017'}"
The Output files must be lines like this:
"3173391045","27/09/2017"
"2173391295","05/01/2017"
"5173351035","07/03/2017"
This is my working code; it handles the merge and takes out the blank lines:
import glob2
import datetime

filenames = glob2.glob("*.txt")

with open(datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S-%f")+".SAI", 'w') as file:
    for filename in filenames:
        with open(filename, "r") as f:
            file.write(f.read())
I'm trying something with .replace, but it is not working; I get syntax errors or blank files:
filedata = filedata.replace("{", "") for line in filedata
If your input files contained valid JSON strings, the correct way would be to parse each line as JSON and write it back as CSV. But as the strings are enclosed in single quotes (') they are rejected by the json module of the Python standard library, so my advice is to use a regex to parse them. The code could become:
import glob2
import datetime
import csv
import re

# the regex to parse the line
rx = re.compile(r".*'nr'\s*:\s*'(\d+)'.*'data'\s*:\s*'([/\d]+)'")

filenames = glob2.glob("*.txt")

with open(datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S-%f")+".SAI", 'w') as file:
    wr = csv.writer(file, quoting=csv.QUOTE_ALL)
    for filename in filenames:
        with open(filename, "r") as f:
            for line in f:  # process line by line
                m = rx.match(line)
                if m:  # skip blank or non-matching lines
                    wr.writerow(m.groups())
With a few tweaks, the input data can be coerced into a form suitable for JSON parsing:
from datetime import datetime
import json
import glob2
import csv

with open(datetime.now().strftime("%Y-%m-%d-%H-%M-%S-%f")+".SAI", 'w', newline='') as f_output:
    csv_output = csv.writer(f_output, quoting=csv.QUOTE_ALL)
    for filename in glob2.glob('*.txt'):
        with open(filename) as f_input:
            for row in f_input:
                row_dict = json.loads(row.strip('"\n').replace("'", '"'))
                csv_output.writerow([row_dict['nr'], row_dict['data']])
Giving you:
"3173391045","27/12/2017"
"2173391295","05/01/2017"
"5173351035","07/03/2017"
Note, in Python 3.x the output file should be opened with newline=''. Without this, extra blank lines can appear in the output file.
Using regex/replace to parse those strings is dangerous: you could always stumble on data containing the delimiter, a comma, etc.
And in this case, even though json cannot read those single-quoted lines, ast.literal_eval can parse them once the surrounding double quotes are stripped:
import ast
import csv
import glob2

filenames = glob2.glob("*.txt")
with open("output.csv", "w", newline="") as fw:
    cw = csv.writer(fw)
    for filename in filenames:
        with open(filename) as f:
            for line in f:
                if not line.strip():
                    continue  # skip blank lines
                # strip the surrounding double quotes, then evaluate the dict literal
                d = ast.literal_eval(line.strip().strip('"'))
                cw.writerow([d['nr'], d['data']])
I need to read multiple CSV files in a zip folder and extract the data from those CSVs into a container in Python.
I am new to Python and have only basic knowledge, so a detailed explanation would be appreciated.
Thanks in advance,
Sampath
The first thing to do is to open the zip file using the zipfile module. Then read the CSV data from each archived file and store it in a container such as a dictionary.
The following will read the data from each file in the zip archive into a dictionary keyed by the file name:
import zipfile

container = {}

with zipfile.ZipFile('/path/to/your/zipfile') as zf:
    for name in zf.namelist():
        container[name] = zf.read(name)

for name in container:
    print("Contents of file {}:".format(name))
    print(container[name])
    print("============================\n")
Optionally you could process the CSV data using the csv module. Something like this should get you started:
import csv
import io
import zipfile

container = {}

with zipfile.ZipFile('/path/to/your/zipfile') as zf:
    for name in zf.namelist():
        # decode the raw bytes so csv.reader gets text (assumes utf-8 files)
        container[name] = csv.reader(io.StringIO(zf.read(name).decode('utf-8')))
Now container is a dictionary keyed by file name with csv.reader objects as values.
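From there, iterating the parsed rows is straightforward - for example (a short sketch using the container built above):
for name, reader in container.items():
    print("Rows in {}:".format(name))
    for row in reader:
        print(row)  # each row is a list of column values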
Here is how you can read all the text inside a zip:
import zipfile

archive = 'c:\\test\\archive.zip'

def readZip(archive):
    zfile = zipfile.ZipFile(archive)
    lines = []
    for finfo in zfile.infolist():
        ifile = zfile.open(finfo)
        # accumulate the lines of every archived file before returning
        lines.extend(ifile.readlines())
    return lines

print(readZip(archive))
Thanks for the help.
Apart from the code provided above, I have come up with code of my own that satisfies the question:
import os
import csv
from zipfile import ZipFile

# Extract and load the files in the zip file to a specified destination
ze = ZipFile("Src_AdventureWorks_Files.zip", "r")
ze.extractall("/home/sreddi/workspace/DQAS_Main/Src_AdventureWorks_Files/")
print("Extraction successful")

# Metadata of the zipfile
zf = ZipFile('Src_AdventureWorks_Files.zip', 'r')
zc = zf.namelist()
print(zc)

# Loop over each extracted csv file and print its data
if __name__ == "__main__":
    os.chdir('/home/sreddi/workspace/DQAS_Main/Src_AdventureWorks_Files')
    for csv_path in zc:
        print("###########")
        print(csv_path)
        print("###########")
        with open(csv_path) as f:
            for row in csv.reader(f):
                print(row)