Speed up parsing of gzipped jsonlines files - python

I have about 5,000 .gzip files (~1MB each). Each of these files contains data in a jsonlines format. Here's what it looks like:
{"category_id":39,"app_id":12731}
{"category_id":45,"app_id":12713}
{"category_id":6014,"app_id":13567}
I want to parse these files and convert them to a pandas dataframe. Is there a way to speed up this process? Here's my code but it's kinda slow (0.5s per file)
import pandas as pd
import jsonlines
import gzip
import os
import io
# Directory that holds the gzipped jsonlines files.
path = 'data/apps/'
files = os.listdir(path)
# Accumulates every record whose category_id is 6014, across all files.
result = []
for n, file in enumerate(files):
# Progress output: index and name of the file being processed.
print(n, file)
# Read the entire compressed file into memory as bytes.
with open(f'{path}/{file}', 'rb') as f:
data = f.read()
# Decompress the whole payload in one call, then wrap the bytes in a
# file-like object so jsonlines.Reader can iterate over it.
unzipped_data = gzip.decompress(data)
decoded_data = io.BytesIO(unzipped_data)
reader = jsonlines.Reader(decoded_data)
for line in reader:
# Keep only the records in category 6014.
if line['category_id'] == 6014:
result.append(line)
# Build the DataFrame from the collected records.
df = pd.DataFrame(result)

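Open each file with gzip.open and iterate over it line by line. This streams the decompressed data, so you can parse each line without first loading and decompressing the whole file in memory.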
import pandas as pd
import json
import gzip
import os
# Directory that holds the gzipped jsonlines files.
path = 'data/apps/'
files = os.listdir(path)
# Accumulates every record whose category_id is 6014, across all files.
result = []
for n, file in enumerate(files):
    print(n, file)
    # gzip.open yields decompressed lines lazily, so the file is never held
    # in memory as a whole. os.path.join avoids the doubled slash that
    # f'{path}/{file}' produces when path already ends with '/'.
    with gzip.open(os.path.join(path, file), 'rb') as f:
        for line in f:
            # A blank line (e.g. a trailing newline) is not valid JSON.
            if not line.strip():
                continue
            data = json.loads(line)
            # .get() skips records that lack the key instead of raising.
            if data.get('category_id') == 6014:
                result.append(data)
df = pd.DataFrame(result)

Related

Outputting or reading columns in a different order using pandas/python

I have a Python script that reads a CSV file and outputs it to HTML.
I would like to change the order of the columns once written to the HTML file, but I don't know if I should do this on read or write or what function to use, here is my code so far.
import pandas as pd
import os
import shutil
import glob
# Paths: destination folder for the generated HTML, and the queue folder.
HTMLPATH="C:/NMS4/QUE/HTML/"
QUEPATH="C:/NMS4/QUE/"
# Create the directory for holding the HTML files if it doesn't exist.
isExist = os.path.exists(HTMLPATH)
if not isExist:
os.mkdir(HTMLPATH, 0o777)
# Convert each .txt file to an HTML file and move it into the HTML folder.
# An empty file can't be converted, so only convert when the file size is not 0.
for quefile in glob.iglob('C:/NMS4/QUE/*.txt'):
if os.path.getsize(quefile) != 0:
# The file has no header row: pick eight columns by position and name them.
csv = pd.read_csv(quefile, header=None, usecols=[0,3,4,15,34,43,44,129], names=['OrderNo','Req Qty','Planned Start','Resource','Op','Part','Desc','Qty Recd'])
# to_html() is called on the DataFrame as read, with no reordering applied.
html_table = csv.to_html()
f = open(quefile + '.html', 'w')
f.write(html_table)
f.close()
# NOTE(review): the file is written with a lowercase '.html' suffix but moved
# using '.HTML'; that only resolves on a case-insensitive filesystem.
shutil.move(quefile + ".HTML", HTMLPATH)
Any help, greatly appreciated.
Thanks.
I've been looking at https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_html.html, but can't quite find out how to re-order the columns.
Edit: the changes that gave me the solution are:
import pandas as pd
import os
import shutil
import glob
#paths
HTMLPATH="C:/NMS4/QUE/HTML/"
QUEPATH="C:/NMS4/QUE/"
# Left-to-right order of the columns in the generated HTML table.
COLUMN_ORDER = ['Order No', 'Op', 'Part', 'Desc', 'Planned Start', 'Req Qty', 'Resource', 'Qty Recd']

# Create the directory for holding the HTML files if it doesn't exist.
# exist_ok avoids the separate (and racy) os.path.exists check.
os.makedirs(HTMLPATH, mode=0o777, exist_ok=True)

# Convert each .txt file to an HTML file in the HTML folder.
for quefile in glob.iglob(QUEPATH + '*.txt'):
    # pandas can't parse an empty file, so skip files of size 0.
    if os.path.getsize(quefile) == 0:
        continue
    # The file has no header row: pick eight columns by position and name them.
    frame = pd.read_csv(quefile, header=None, usecols=[0,3,4,15,34,43,44,129], names=['Order No','Req Qty','Planned Start','Resource','Op','Part','Desc','Qty Recd'])
    # Selecting with a list of names returns the columns in exactly that order.
    df = frame[COLUMN_ORDER]
    html_name = quefile + '.html'
    with open(html_name, 'w') as f:
        f.write(df.to_html())
    # Move the same name that was just written (the original mixed '.html'
    # and '.HTML', which only works on a case-insensitive filesystem).
    shutil.move(html_name, HTMLPATH)

How to use elements in list by order

My goal is to change multiple csv files in a folder into JSON.
First, I needed to list my csv files
# Walk every entry in the folder and print the full path of each .csv file.
for file in os.listdir("C:/Users/folder_to_csv"):
filename = os.fsdecode(file)
if filename.endswith(".csv"):
# Check that the csv files are listed correctly.
print(os.path.join("C:/Users/folder_to_csv", filename))
With this, I was able to call csv files in that folder.
Result:
C:/Users/folder_to_csv\file_1.csv C:/Users/folder_to_csv\file_2.csv C:/Users/folder_to_csv\file_3.csv
Then I wanted to convert all of the CSV files in 'csvlist' to jsonObj; however, for some reason my code only uses the first file (C:/Users/folder_to_csv\file_1.csv).
This is what I have tried so far:
import json
import csv
import requests
import threading
import os
for file in os.listdir("C:/Users/folder_to_csv"):
filename = os.fsdecode(file)
if filename.endswith(".csv"):
# NOTE(review): csvlist is rebound to a single path string on every match,
# so after the loop it holds one path, not a list of paths.
csvlist = os.path.join("C:/Users/folder_to_csv", filename)
data = {}
def main():
# Loop over the csv list so that every csv file is read.
# NOTE(review): csvlist is a string here, so len() counts its characters.
length = len(csvlist)
for i in range(length):
i += 1
# NOTE(review): i is never used to index; path is the same string every pass.
path = csvlist
# Convert the csv rows to JSON.
with open(path, mode='r') as f:
reader = csv.DictReader(f)
processdata = [row for row in reader]
dlist = processdata
jsonObj = json.dumps(dlist)
# NOTE(review): the next line is a stray closing bracket pair with no opener.
})
print(jsonObj)
main()
In the initial loop you keep reassigning the csvlist variable, so it only ever holds one path. I suppose you want it to be a list? Then create an empty list before the loop and append to it instead of reassigning:
# Create the list once, before the loop over the directory.
csvlist = []
...
# Inside the loop, append each matching path instead of rebinding csvlist.
csvlist.append(os.path.join("C:/Users/folder_to_csv", filename))

How to read from a csv file in zip folder and save data from csv file in database?

import glob
import os
import csv
import zipfile
from io import StringIO
# The pattern contains no wildcard, so glob yields this one path if it exists.
for name in glob.glob('C:/Users/RAMESH SANTHA/Downloads/download-NIFTY 50-01012020.zip'):
# Strip the directory and the .zip extension to get the bare archive name.
base = os.path.basename(name)
filename = os.path.splitext(base)[0]
datadirectory = 'C:/Users/RAMESH SANTHA/Downloads/'
dataFile = filename
# Rebuild the archive path and derive the csv member name from the same stem.
archive = '.'.join([dataFile, 'zip'])
fullpath = ''.join([datadirectory, archive])
csv_file = '.'.join([dataFile, 'csv']) #all fixed
filehandle = open(fullpath, 'rb')
zfile = zipfile.ZipFile(filehandle)
# NOTE(review): StringIO is already the class (from io import StringIO), so
# StringIO.StringIO is the attribute lookup that raises the AttributeError.
data = StringIO.StringIO(zfile.read(csv_file))
reader = csv.reader(data)
for row in reader:
print (row)
I tried following code to read data from zip folder which contains csv file and print rows but got error:
data = StringIO.StringIO(zfile.read(csv_file))
AttributeError: type object '_io.StringIO' has no attribute 'StringIO'
There is no StringIO.StringIO(); use io.StringIO() instead:
import io
data = io.StringIO(...)
With your existing import (from io import StringIO) you can call it directly, without the io. prefix:
from io import StringIO
data = StringIO(...)
BTW: I think you overcomplicated the code by using glob and join(). You can also pass the filename directly to ZipFile without calling open() first:
import os
import csv
import zipfile
import io
zip_fullname = 'C:/Users/RAMESH SANTHA/Downloads/download-NIFTY 50-01012020.zip'
zip_file = os.path.basename(zip_fullname)
# splitext swaps only the final extension; str.replace would also rewrite
# any other '.zip' that happens to appear inside the name.
csv_file = os.path.splitext(zip_file)[0] + '.csv'
print(zip_file) # download-NIFTY 50-01012020.zip
print(csv_file) # download-NIFTY 50-01012020.csv
# The with-blocks close the archive and the member even if reading fails.
with zipfile.ZipFile(zip_fullname) as zfile:
    with zfile.open(csv_file) as raw:
        # The member is read as bytes; TextIOWrapper decodes it to text as
        # it streams. newline='' lets the csv module handle line endings.
        data = io.TextIOWrapper(raw, encoding='utf-8', newline='')
        reader = csv.reader(data)
        for row in reader:
            print(row)
But with pandas it should be even simpler
import pandas as pd
# NOTE(review): read_csv is handed the .zip path directly, relying on pandas
# to decompress the archive itself; confirm the archive holds a single CSV.
df = pd.read_csv('C:/Users/RAMESH SANTHA/Downloads/download-NIFTY 50-01012020.zip')
print(df)
Looking at your script, the error occurs when opening the CSV file inside the zip archive. Below is Python 3 code that works for me on a zip file containing several CSVs. The directory to extract into should exist before you run the script.
import zipfile
path_to_zip_file='/tmp/test1.zip' # Assumes this file exists. The path is from a Mac, but the same approach should work on Windows as well.
directory_to_extract_to='/tmp/extract/' # Assumes this directory already exists.
import csv,os
import codecs
import glob
# Unpack every member of the archive into the target directory.
with zipfile.ZipFile(path_to_zip_file, 'r') as zip_ref:
    zip_ref.extractall(directory_to_extract_to)

# glob already returns paths that include the directory, so they can be
# opened as-is; joining the directory onto them again is unnecessary.
for path in glob.glob(os.path.join(directory_to_extract_to, '*.csv')):
    # Text mode with newline='' is how the csv module expects its input.
    with open(path, newline='', encoding='utf-8') as f:
        # DictReader yields each row as a dict keyed by the header line.
        # Use csv.reader(f) instead to get each row as a plain list.
        for row in csv.DictReader(f):
            print(row)

I have a folder with many .tar.gz files. In python I want to go into each file unzip or compress and find text file that has string I want to extract?

I have a main folder with many .tar.gz compressed files. I need to decompress twice (gzip, then tar) to get to a data file containing text, and then I extract a certain string from that text. I am having trouble unpacking each archive to reach the text file and then moving on to the next archive to do the same, saving the results in a dataframe.
import os
import tarfile
# Intended flow: open each archive in the folder, read every member, and
# collect the regex matches into a DataFrame.
for i in os.listdir(r'\user\project gz'):
# NOTE(review): this builds a plain tuple; no archive is opened here.
tar = (i, "r:gz")
for m in tar.getmembers():
# NOTE(review): the loop variable is m, but `member` is used below, and the
# trailing colon is not valid on an assignment.
f= tar.extractfile(member):
if f is not None:
content = f.read()
# NOTE(review): `re` and `pd` are not imported in this snippet.
text = re.findall(r"\name\s", content)
# NOTE(review): spelled Dataframe here; the answer below uses pd.DataFrame.
df = pd.Dataframe(text)
print(df)
I guess you want to find which files inside \user\project gz\*.tar.gz contain the string \name\s?
A solution is
import os
import re
import tarfile
import pandas as pd
# Folder that holds the .tar.gz archives. In a raw string a single backslash
# is already literal, so the backslashes must not be doubled.
archive_dir = r'\user\project gz'
# Matches the literal text \name\s; compiled once, outside the loops.
pattern = re.compile(r"\\name\\s")

row = []    # names of the members that contain the pattern
value = []  # full text of those members, parallel to `row`
for filename in os.listdir(archive_dir):
    if not filename.endswith('.tar.gz'):
        continue
    # os.path.join supplies the separator that plain string concatenation
    # of folder + filename left out; `with` closes the archive on any exit.
    with tarfile.open(os.path.join(archive_dir, filename)) as tar:
        for text_file in tar.getmembers():
            f = tar.extractfile(text_file)
            # Skip members for which extractfile returns None.
            if f is None:
                continue
            # errors='replace' keeps one undecodable member from aborting the run.
            content = f.read().decode(errors='replace')
            if pattern.search(content):
                row.append(text_file.name)
                value.append(content)
df = pd.DataFrame(value, columns=['nametag'], index=row)
print(df)

How to write csv file in html?

I have read a CSV file, but I don't know how to save its contents as a table in table.html.
import csv
# NOTE(review): html_about is assigned but never used in this snippet.
html_about = ''
names = []
# Collect the first column of every row in filo.csv.
with open('filo.csv') as data_file:
csv_data = csv.reader(data_file)
for line in csv_data:
names.append(f'{line[0]}')
# Build an HTML bullet list from the collected names.
# NOTE(review): html_output is built but never written to a file.
html_output = '\n<ul>'
for name in names:
html_output += f'\n\t<li>{name}</li>'
html_output += '\n</ul>'
from prettytable import PrettyTable
# NOTE(review): `line` still holds the last row read by the loop above, so
# only that row's first cell is passed to PrettyTable.
x = PrettyTable(line[0])
html_code = x.get_html_string()
html_file = open('table.html','w')
# NOTE(review): this rebinds html_file to write()'s return value; the file
# object is never closed.
html_file = html_file.write(html_code)
I suggest you use the pandas library: it has pd.read_csv and DataFrame.to_html.
Usage should look like this; let me know if this works for you:
import pandas as pd
# Read the CSV into a DataFrame; the first line is used as the column headers.
df = pd.read_csv('filo.csv')
# to_html accepts an open file object and writes the <table> markup to it;
# the with-block closes the file when the write is done.
with open('table.html', 'w') as html_file:
    df.to_html(html_file)

Categories