How to convert a bulk of CSV files into JSON - Python

Below is the code to convert one CSV file into JSON. The files in the directory are:
[data1.csv, data2.csv, data.csv]
I need to loop over all of these files instead of converting just one.
import csv
import json
import os
import glob

os.chdir(r'dir')
result = glob.glob('*.csv')
print(result)

def make_json(csvFilePath, jsonFilePath):
    for i in result:
        data = {}
        with open(csvFilePath, encoding='utf-8') as csvf:
            csvReader = csv.DictReader(csvf)
            for rows in csvReader:
                key = rows['id']
                data[key] = rows
        with open(jsonFilePath, 'w', encoding='utf-8') as jsonf:
            jsonf.write(json.dumps(data, indent=4))
        csvFilePath = f"{i}"
        jsonFilePath = f"{i.split('.')[-2]}.json"

make_json(csvFilePath, jsonFilePath)
I have many similar files which need to be converted to JSON; in every file, id is the primary key. There are a lot of files, and I don't want to change csvFilePath and jsonFilePath every time.
The issues I am facing:
All my JSON files end with .csv.json.
The last file, data9.csv, is not converted to JSON at all, and data2 is converted twice, as data2.json and data2.csv.json.

Here is an approach:

import glob
import pandas as pd

for csv_f in glob.glob('/<file_path>/*.csv'):
    with open(f'{csv_f.replace(".csv", ".json")}', "w") as f:
        pd.read_csv(csv_f).set_index('id') \
            .to_json(f, orient='index')
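
If you would rather stay with the csv/json approach from the question, here is a minimal standard-library sketch; the key fix is computing both paths from each file name before opening anything, which is what the original loop gets wrong:

import csv
import json
import glob

for csvFilePath in glob.glob('*.csv'):
    # Derive the output name first, e.g. data1.csv -> data1.json
    jsonFilePath = f"{csvFilePath.rsplit('.', 1)[0]}.json"
    data = {}
    with open(csvFilePath, encoding='utf-8') as csvf:
        for row in csv.DictReader(csvf):
            data[row['id']] = row  # 'id' is the primary key in every file
    with open(jsonFilePath, 'w', encoding='utf-8') as jsonf:
        jsonf.write(json.dumps(data, indent=4))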

Related

JSON format inside record in csv file - convert to column Python

This is my data. The columns 'device' and 'geoNetwork' store their data as a dict/JSON string. I would like to create new columns based on the data from those columns, for example new columns 'browser', 'browserVersion', 'continent', and so on. I have tried a lot of solutions, but it doesn't work.
DATA
,date,device,fullVisitorId,geoNetwork
0,20180420,"{""browser"": ""Chrome"", ""browserVersion"": ""not available in demo dataset"", ""browserSize"": ""not available in demo dataset"", ""operatingSystem"": ""Macintosh""}",3.37108036201195E+018,"{""continent"": ""Americas"", ""subContinent"": ""Northern America"", ""country"": ""United States"", ""region"": ""California""}"
1,20180328,"{""browser"": ""Chrome"", ""browserVersion"": ""not available in demo dataset"", ""browserSize"": ""not available in demo dataset"", ""operatingSystem"": ""Macintosh""}",1.27350339266773E+018,"{""continent"": ""Americas"", ""subContinent"": ""Northern America"", ""country"": ""Canada"", ""region"": ""State of Sao Paulo""}"
A little help with how to solve my problem would be appreciated. In the attached picture, the first new column name is highlighted; it is JSON.
import csv
import json

def csv_to_json(csvFilePath, jsonFilePath):
    jsonArray = []
    # read the csv file
    with open(csvFilePath, encoding='utf-8') as csvf:
        csvReader = csv.DictReader(csvf)
        for row in csvReader:
            jsonArray.append(row)
    with open(jsonFilePath, 'w', encoding='utf-8') as jsonf:
        jsonString = json.dumps(jsonArray, indent=4)
        jsonf.write(jsonString)

csvFilePath = r'dane.csv'
jsonFilePath = r'data.json'
csv_to_json(csvFilePath, jsonFilePath)
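
The generic converter above does not expand the nested columns the question actually asks about. Here is a minimal pandas sketch for that part, assuming the 'device' and 'geoNetwork' cells are valid JSON strings as in the sample data:

import json
import pandas as pd

df = pd.read_csv('dane.csv')  # file name taken from the snippet above
for col in ['device', 'geoNetwork']:
    # Parse each JSON cell, then turn the dict keys into columns.
    expanded = pd.json_normalize(df[col].apply(json.loads).tolist())
    df = df.drop(columns=[col]).join(expanded)
print(df.columns)  # now includes browser, browserVersion, continent, ...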

How to convert Word Doc with tables to csv and output as JSON

I have the following code to extract tables from a Word doc and create a list of CSV files from the tables:
from docx import Document
import pandas as pd
import csv
import json
import time

document = Document('pathtoFile')
tables = []
for table in document.tables:
    df = [['' for i in range(len(table.columns))] for j in range(len(table.rows))]
    for i, row in enumerate(table.rows):
        for j, cell in enumerate(row.cells):
            if cell.text:
                df[i][j] = cell.text
    tables.append(pd.DataFrame(df))
for nr, i in enumerate(tables):
    i.to_csv("table_" + str(nr) + ".csv")
I also have the following script to take a CSV file and convert it to JSON:
import csv
import json
import time

def csv_to_json(csvFilePath, jsonFilePath):
    jsonArray = []
    # read the csv file
    with open(csvFilePath, encoding='utf-8', errors='ignore') as csvf:
        # load the csv file data using the csv library's dictionary reader
        csvReader = csv.DictReader(csvf)
        # convert each csv row into a python dict
        for row in csvReader:
            # add this python dict to the json array
            jsonArray.append(row)
    # convert the python jsonArray to a JSON string and write it to file
    with open(jsonFilePath, 'w', encoding='utf-8') as jsonf:
        jsonString = json.dumps(jsonArray, indent=4)
        jsonf.write(jsonString)

csvFilePath = r'pathtoFile'
jsonFilePath = r'pathtoFile'
start = time.perf_counter()
csv_to_json(csvFilePath, jsonFilePath)
finish = time.perf_counter()
print(f"Conversion completed successfully in {finish - start:0.4f} seconds")
The main issue is combining the two: taking the Word document with the tables, extracting the tables to CSVs, and then converting the CSVs to JSON. I may be overcomplicating this, but I am open to suggestions.
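
One way to combine the two is to skip the intermediate CSV entirely and build the JSON records straight from the table cells. Here is a minimal sketch under one assumption not stated in the question: that the first row of each table holds the column names.

import json
from docx import Document

document = Document('pathtoFile')  # placeholder path from the snippets above
for nr, table in enumerate(document.tables):
    rows = [[cell.text for cell in row.cells] for row in table.rows]
    header, *body = rows  # assumes the first row holds the column names
    records = [dict(zip(header, r)) for r in body]
    with open(f"table_{nr}.json", 'w', encoding='utf-8') as jsonf:
        json.dump(records, jsonf, indent=4)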

Why is my code not working while converting bulk csv to json?

There are two CSV files that I need to convert to JSON. The code is below:
import csv
import json
import os
import glob

os.chdir(r'C:\Users\user\Desktop\test')
result = glob.glob('*.csv')
print(result)

def make_json(csvFile, jsonFile):
    csvFile, jsonFile = '', ''
    for i in result:
        data = {}
        with open(csvFile, encoding='utf-8') as csvf:
            csvReader = csv.DictReader(csvf)
            for rows in csvReader:
                key = rows['id']
                data[key] = rows
        with open(jsonFile, 'w', encoding='utf-8') as jsonf:
            jsonf.write(json.dumps(data, indent=4))
        csvFilePath = f"{i}"
        jsonFilePath = f"{i.split('.')[-2]}.json"

make_json(csvFile, jsonFile)
I got an error saying csvFile is not defined, but the third line from the end mentions the CSV file.
Disclaimer: please find the error in this code. I already know of the working pandas version.
Below is the corrected code, but I would recommend learning to use the Python debugger so you can resolve logic flaws in your code yourself next time. Documentation on the Python debugger can be found here:
https://docs.python.org/3/library/pdb.html
Your code was structured in a way that meant that, for each CSV file, you were not setting the file name until after you attempted to open it. The immediate error you saw was raised because you called make_json() before you defined values for csvFile and jsonFile.
I would recommend changing the code to:
import csv
import json
import glob

def make_json(csvList):
    for csvFile in csvList:
        data = {}
        with open(csvFile, encoding='utf-8') as csvf:
            csvReader = csv.DictReader(csvf)
            for rows in csvReader:
                key = rows['id']
                data[key] = rows
        jsonFile = f"{csvFile.split('.')[-2]}.json"
        with open(jsonFile, 'w', encoding='utf-8') as jsonf:
            jsonf.write(json.dumps(data, indent=4))

make_json(glob.glob('*.csv'))
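
For example, given a hypothetical data1.csv containing:

id,name
1,alpha
2,beta

the script writes data1.json as:

{
    "1": {
        "id": "1",
        "name": "alpha"
    },
    "2": {
        "id": "2",
        "name": "beta"
    }
}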
You should try this (note that it writes each file as a JSON array of rows rather than an object keyed by id):

import csv, json, os, glob

os.chdir(r'C:\Users\user\Desktop\test')
result = glob.glob('*.csv')
print(result)

def make_json():
    for i in result:
        with open(i, encoding='utf-8') as csvf:
            data = [row for row in csv.DictReader(csvf)]
        with open(f"{i.split('.')[-2]}.json", 'w', encoding='utf-8') as jsonf:
            json.dump(data, jsonf)

make_json()
You did not initialize either of the arguments of make_json() (csvFile and jsonFile) before calling it.

How to read from a CSV file in a zip folder and save data from the CSV file in a database?

import glob
import os
import csv
import zipfile
from io import StringIO

for name in glob.glob('C:/Users/RAMESH SANTHA/Downloads/download-NIFTY 50-01012020.zip'):
    base = os.path.basename(name)
    filename = os.path.splitext(base)[0]
    datadirectory = 'C:/Users/RAMESH SANTHA/Downloads/'
    dataFile = filename
    archive = '.'.join([dataFile, 'zip'])
    fullpath = ''.join([datadirectory, archive])
    csv_file = '.'.join([dataFile, 'csv'])  # all fixed
    filehandle = open(fullpath, 'rb')
    zfile = zipfile.ZipFile(filehandle)
    data = StringIO.StringIO(zfile.read(csv_file))
    reader = csv.reader(data)
    for row in reader:
        print(row)
I tried the following code to read data from a zip folder which contains a CSV file and print the rows, but I got an error:
data = StringIO.StringIO(zfile.read(csv_file))
AttributeError: type object '_io.StringIO' has no attribute 'StringIO'
There is no StringIO.StringIO(), only io.StringIO():

import io
data = io.StringIO(...)

With your import it even works without the io prefix:

from io import StringIO
data = StringIO(...)
BTW: I think you overcomplicated the code by using glob and join(). And you can use the filename directly with ZipFile, without open():
import os
import csv
import zipfile
import io

zip_fullname = 'C:/Users/RAMESH SANTHA/Downloads/download-NIFTY 50-01012020.zip'
zip_file = os.path.basename(zip_fullname)
csv_file = zip_file.replace('.zip', '.csv')
print(zip_file)  # download-NIFTY 50-01012020.zip
print(csv_file)  # download-NIFTY 50-01012020.csv

zfile = zipfile.ZipFile(zip_fullname)
data = io.StringIO(zfile.read(csv_file).decode('utf-8'))  # bytes need to be converted to a string
reader = csv.reader(data)
for row in reader:
    print(row)
But with pandas it should be even simpler; pandas infers the compression from the .zip extension, as long as the archive contains a single CSV file:

import pandas as pd

df = pd.read_csv('C:/Users/RAMESH SANTHA/Downloads/download-NIFTY 50-01012020.zip')
print(df)
Looking at the script, you are getting the error while opening the CSV file from the zip file. Below is Python 3 code that I have working for a zip file containing a few CSVs. The directory to extract to should exist before you run the script.
import csv
import codecs
import glob
import os
import zipfile

path_to_zip_file = '/tmp/test1.zip'        # assuming this file exists; this path is from a Mac, but it should work on Windows as well
directory_to_extract_to = '/tmp/extract/'  # assuming this directory already exists

with zipfile.ZipFile(path_to_zip_file, 'r') as zip_ref:
    zip_ref.extractall(directory_to_extract_to)

for file in glob.glob(directory_to_extract_to + '*.csv'):
    path = os.path.join(directory_to_extract_to, file)
    with open(path, 'rb') as f:
        reader = csv.reader(codecs.iterdecode(f, 'utf-8'))
        # The code below prints the rows as arrays:
        # for row in reader:
        #     print(row)
        # Reading rows as ordered dictionaries:
        dictReader = csv.DictReader(codecs.iterdecode(f, 'utf-8'))
        for row in dictReader:
            print(row)
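
Neither snippet covers the "save data in a database" part of the title. Here is a minimal sketch using the standard-library sqlite3 module; it assumes every row has the same number of fields as the header, and the database file nifty.db and table name quotes are hypothetical:

import csv
import io
import sqlite3
import zipfile

zfile = zipfile.ZipFile('C:/Users/RAMESH SANTHA/Downloads/download-NIFTY 50-01012020.zip')
data = io.StringIO(zfile.read('download-NIFTY 50-01012020.csv').decode('utf-8'))
header, *body = list(csv.reader(data))

con = sqlite3.connect('nifty.db')  # hypothetical database file
cols = ', '.join(f'"{name}"' for name in header)  # column names taken from the CSV header
marks = ', '.join('?' for _ in header)
con.execute(f'CREATE TABLE IF NOT EXISTS quotes ({cols})')  # hypothetical table name
con.executemany(f'INSERT INTO quotes VALUES ({marks})', body)
con.commit()
con.close()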

Python CSV writer - writing columns in new csv file up to maximum number of fields in csv files

I have 200 CSV files in my folder.
What I am trying to do is read the first row of each file and write it into a new CSV.
On top, I want to write [file, field1, field2, ... fieldn],
where n is the maximum number of fields.
import csv
import glob

list = []
hel = []
files = glob.glob('C:/dataset/*.csv')
with open('test.csv', 'w', newline='') as testfile:
    csv_writer = csv.writer(testfile)
    for file in files:
        with open(file, 'r') as infile:
            file = file[file.rfind('\\') + 1:]
            file = file.strip('.csv')
            reader = csv.reader(infile)
            headers = next(reader)
            hel.append(len(headers))
            max(hel)
            lst = [file] + headers
            csv_writer.writerow(lst)
It turned out that the maximum number of fields across the 200 files is 255.
So at the top of the new CSV file, I want to write file, field1, field2 ... field255.
How can I do this?
import csv
import glob

list = []
hel = []
files = glob.glob('C:/dataset/*.csv')
with open('test.csv', 'w', newline='') as testfile:
    csv_writer = csv.writer(testfile)
    for file in files:
        with open(file, 'r') as infile:
            file = file[file.rfind('\\') + 1:]
            file = file.strip('.csv')
            reader = csv.reader(infile)
            headers = next(reader)
            hel.append(len(headers))
            b = ['field{}'.format(i) for i in range(1, max(hel) + 1)]
            lst = [file] + headers
            csv_writer.writerow(lst)
Now b is a list that looks like ['field1', 'field2', ... 'field255'].
I need to insert 'file' before 'field1' and write that row at the top of the new CSV file. Writing the code after csv_writer.writerow(lst) gives me a CSV file with 'field1', 'field2', ... on every other line. How can I fix this problem?
You first need to read all your input files just to determine that the maximum number of fields is 255. Then you need to construct the list of field names to write into the output file (just once, not in a loop):

['field{}'.format(i) for i in range(1, 256)]

You can pass that list to the csv module's writer to write it as the first row.
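
A sketch of that two-pass approach, reusing the paths from the question:

import csv
import glob
from os.path import basename, splitext

# First pass: collect each file's name and header row, and find the widest header.
entries = []
for path in glob.glob('C:/dataset/*.csv'):
    with open(path, newline='') as infile:
        entries.append((splitext(basename(path))[0], next(csv.reader(infile))))
max_fields = max(len(header) for _, header in entries)

# Second pass: write the field-name row once, then one row per input file.
with open('test.csv', 'w', newline='') as testfile:
    writer = csv.writer(testfile)
    writer.writerow(['file'] + ['field{}'.format(i) for i in range(1, max_fields + 1)])
    for name, header in entries:
        writer.writerow([name] + header)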
Read the field count and the first line from each file before writing the output file.
import glob
from itertools import chain
import os
from os.path import splitext, basename

def first_line(filepath):
    with open(filepath) as f:
        return next(f)

def write_test_file(dest_file_path, source_path_name):
    source_paths = glob.glob(source_path_name)
    first_lines = list(map(first_line, source_paths))
    max_count = max(l.count(",") for l in first_lines)
    field_names = map("field{}".format, range(1, max_count + 2))
    header = ",".join(chain(["file"], field_names)) + os.linesep
    file_names = (splitext(basename(p))[0] for p in source_paths)
    content = chain([header], map(",".join, zip(file_names, first_lines)))
    with open(dest_file_path, 'w') as testfile:
        testfile.write("".join(content))

write_test_file('test.csv', 'C:/dataset/*.csv')
