import os
import pandas as pd
sheet_id = '1kUvTVkvMJOJ6Mqo7h3F_acUMsErulZvpMGTyWQxeaQM'
df = pd.read_csv(f"https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=csv")
print(df)
root = f'D:/Project/Zee/SRGMP/Production/3D'
with open(r'D:\project\Copy of rk', 'r') as file:
csvfile = csv.DictReader(file)
for row in csvfile:
set_name = row['set_name']
asset_name = row['assest_name']
fixed_internal_folder = ['wip', 'export']
for folder in fixed_internal_folder:
asset_root = f'{root}/{set_name}/asset/{asset_name}/{folder}/'
os.makedirs(asset_root)
print(asset_root)
if not os.path.exists(asset_root):
os.makedirs(asset_root)
else:
os.removedirs(asset_root)
Related
I have multiple CSV files to be imported into multiple worksheets named in the same as the CSV file.
However, I have difficulties in creating/appending multiple worksheets.
If I use ExcelWriter(pathDestination, mode = 'a'), FileNotFoundError happens.
If I use ExcelWriter(pathDestination), then only the last CSV file will be created in the worksheet.
How shall I improve the code without the need of listing down each csvpath when doing the to_excel?
import openpyxl
import numpy as np
import pandas as pd
import os
pathDestination = 'Downloads/TemplateOne.xlsx'
csvpathI = '2019_27101220_Export.csv'
csvpathII = '2019_27101220_Import.csv'
csvpathIII = '2020_27101220_Export.csv'
csvpathIV = '2020_27101220_Import.csv'
csvpath_list = [csvpathI, csvpathII, csvpathIII, csvpathIV]
for csvpath in csvpath_list:
df = pd.read_csv(csvpath)
conversion_unit = 1000
supplymentry_conversion_unit = 1000
df['quantity_converted'] = np.multiply(df['Quantity'],conversion_unit)
df['supplimentry_quantity_converted'] = np.multiply(df['Supplimentary Quantity'],conversion_unit)
csvnames = os.path.basename(csvpath).split(".")[0]
with pd.ExcelWriter(pathDestination) as writer:
df.to_excel(writer, sheet_name = csvnames, index = False)`
You need to put the loop inside the context manager in order to save each (df) to a separate sheet.
Try this :
conversion_unit = 1000
supplymentry_conversion_unit = 1000
with pd.ExcelWriter(pathDestination) as writer:
for csvpath in csvpath_list:
df = pd.read_csv(csvpath)
df['quantity_converted'] = df['Quantity'].mul(conversion_unit)
df['supplimentry_quantity_converted'] = df['Supplimentary Quantity'].mul(supplymentry_conversion_unit)
df.to_excel(writer, sheet_name = csvpath.split(".")[0], index = False)
I want to split csv file into 2 lists using column name
CSV file:
Molecule Name,SMILES
ZINC53 (Aspirin),CC(=O)Oc1ccccc1C(=O)O
ZINC7460 (Vatalanib),Clc1ccc(Nc2nnc(Cc3ccncc3)c3ccccc23)cc1
ZINC1493878 (Sorafenib),CNC(=O)c1cc(Oc2ccc(NC(=O)Nc3ccc(Cl)c(C(F)(F)F)c3)cc2)ccn1
Code:
namelist = list()
smileslist = list()
with open('./file.csv', 'r') as f:
f = csv.reader(f, delimiter=',')
columns = next(f)
type_col1 = columns.index("Molecule Name")
type_col2 = columns.index("SMILES")
for column in f:
if type_col1 == 'Molecule Name':
namelist.append(column)
elif type_col2 == 'SMILES':
smileslist.append(column)
With pandas library you can do it as easily as :
import pandas as pd
df = pd.read_csv("./file.csv")
namelist = df["Molecule Name"].tolist()
smileslist = df["SMILES"].tolist()
print(namelist)
print(smileslist)
Or if you prefer using the csv reader you can do it as follow :
import csv
namelist = list()
smileslist = list()
with open("./file.csv", "r") as f:
f = csv.reader(f, delimiter=',')
columns = next(f)
index_col1 = columns.index("Molecule Name")
index_col2 = columns.index("SMILES")
for column in f:
namelist.append(column[index_col1])
smileslist.append(column[index_col2])
A csv file has 90 million rows. One of the Columns in named "State". It has 12 unique values at present. (The count of unique values in the "State" column is dynamic and can change with each csv file.)
I want to split the DataFrame into smaller chunks and then save State-wise files.
The code below is not working.
source_path = "DataJune.txt"
for i,chunk in enumerate(pd.read_csv(source_path, sep = '|',chunksize=1000000)):
dfs = dict(tuple(chunk.groupby('State')))
for i, df in dfs.items():
df = df.append(df)
df.to_csv("tempcsv/" + i +".csv",sep=",", index = False)
IIUC, Try:
source_path = "DataJune.txt"
from collections import defaultdict
def def_value():
return pd.DataFrame()
# Defining the dict
d = defaultdict(def_value)
for i,chunk in enumerate(pd.read_csv(source_path, sep = '|',chunksize=2)):
chunk_states = chunk['State'].unique()
for state in chunk_states:
d[state]=d[state].append(chunk[chunk['State']==state])
for i, df in d.items():
df.to_csv("tempcsv/" + str(i) +".csv",sep=",", index = False)
Another version, based on the #Corralien comment:
source_path = "DataJune.txt"
for i,chunk in enumerate(pd.read_csv(source_path, sep = '|',chunksize=2)):
chunk_states = chunk['State'].unique()
for state in chunk_states:
with open("tempcsv/" + str(state) +".csv",mode='a+') as file:
for i, row in chunk[chunk['State']==state].iterrows():
file.write(','.join([str(x) for x in row]))
file.write('\n')
Another version:
source_path = "DataJune.txt"
from os.path import exists
import csv
for i,chunk in enumerate(pd.read_csv(source_path, sep = '|',chunksize=2)):
chunk_states = chunk['State'].unique()
for state in chunk_states:
path = "tempcsv/" + str(state) +".csv"
file_exists = exists(path)
if not file_exists:
with open(path,newline='',mode='a+') as file:
writer = csv.writer(file)
writer.writerow(chunk.columns)
print(chunk.columns)
with open(path,newline='',mode='a+') as file:
writer = csv.writer(file)
writer.writerows(chunk[chunk['State']==state].values)
You can use:
import pandas as pd
import os
source_path = 'DataJune.txt'
fps = {}
for chunk in pd.read_csv(source_path, sep='|', chunksize=1000000, dtype=object):
for state, df in chunk.groupby('State'):
# New state, create a new file and write headers
if state not in fps:
fps[state] = open(f'tempcsv/{state}.csv', 'w')
fps[state].write(f"{','.join(df.columns)}{os.linesep}")
# Write data without headers
df.to_csv(fps[state], index=False, header=False)
# Close files properly
for fp in fps.values():
fp.close()
del fps
Update
Try to replace:
# Write data without headers
df.to_csv(fps[state], index=False, header=False)
By
# Write data without headers
g = (row.strip() for row in df.to_csv(index=False, header=None, sep=',').split(os.linesep) if row)
print(*g, sep=os.linesep, file=fps[state])
Here's my code:
import glob
import itertools
import sys, os
import six
import csv
import numpy as np
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdftypes import resolve1
os.chdir("PATH/pdf")
extension = 'pdf'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
valeur = []
n = 1
for i in all_filenames:
fp = open(i, "rb")
parser = PDFParser(fp)
doc = PDFDocument(parser)
fields = resolve1(doc.catalog["AcroForm"])["Fields"]
for i in fields:
field = resolve1(i)
name, value = field.get("T"), field.get("V")
filehehe = "{0}:{1}".format(name,value)
values = resolve1(value)
names = resolve1(name)
valeur.append(values)
n = n+1
with open('test.csv','wb') as f:
for i in valeur:
f.write(i)
The goal here is to pick up some informations in PDF. Here's the output :
As you can see, the format is not pretty. I'm not very familiar with open() so I'm kind of stuck.
I would like to have distinct rows for each PDF with each informations having her own cell. Something like that :
Try to store the data from each pdf file in a separate list. And add this list to the valeur list which you have.
Use csv module as #martineau rightly suggested.
You can try the with below code.
import csv
valeur = []
#your code
n = 1
for i in all_filenames:
temp_list = []
fp = open(i, "rb")
parser = PDFParser(fp)
doc = PDFDocument(parser)
fields = resolve1(doc.catalog["AcroForm"])["Fields"]
for i in fields:
field = resolve1(i)
name, value = field.get("T"), field.get("V")
filehehe = "{0}:{1}".format(name,value)
values = resolve1(value)
names = resolve1(name)
temp_list.append(values)
n = n+1
valeur.append(temp_list)
#Finally when you have the required data, you can write to csv file like this.
with open('mycsv.csv', 'w', newline='') as myfile:
wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
for val in valeur:
wr.writerow(val)
With this, the output would be like this
I can able to convert xlsx to csv in the case of single excel sheet.
How can i do the same in the case of multiple sheet in single excel file?
I have tried:
workBook = xlrd.open_workbook(filePath)
sheet_names = workBook.sheet_names()
lenth = len(sheet_names)
for i in range(0,lenth):
sheet = workBook.sheet_by_name(sheet_names[i])
yourcsvFile = open(csvPath, 'wb')
wr = csv.writer(yourcsvFile, quoting=csv.QUOTE_ALL)
for rownum in xrange(sheet.nrows):
wr.writerow(sheet.row_values(rownum))
yourcsvFile.close()
Try this
import sys
import xlrd
import csv
filePath = sys.argv[1] # user input file
csvPath = sys.argv[2]
workBook = xlrd.open_workbook(filePath)
sheet_names = workBook.sheet_names()
list_sheet = []
lenth = len(sheet_names)
for i in range(0,lenth):
sheet = workBook.sheet_by_name(sheet_names[i])
list_sheet.append(sheet)
yourcsvFile = open(csvPath, 'wb')
wr = csv.writer(yourcsvFile, quoting=csv.QUOTE_ALL)
total_row = list_sheet[0].ncols
for k in xrange(0,1):
for rownum in xrange(list_sheet[k].nrows):
wr.writerow(list_sheet[k].row_values(rownum))
if len(sheet_names) > 1:
for k in xrange(1,len(list_sheet)):
if list_sheet[k].ncols != total_row:
continue
for rownum in xrange(1,list_sheet[k].nrows):
wr.writerow(list_sheet[k].row_values(rownum))
yourcsvFile.close()