Now, I don't have any problems converting this CSV or downloading it; my problem is saving it to a Django model's FileField.
The minimized sample code:
import csv

import boto
import requests
import xlsxwriter
from django.core.files import File
from django.core.files.temp import NamedTemporaryFile


def download_convert_reports_s3_temp():
    def get_report_url():
        bucket_name = 'temp_bucket'
        conn = boto.connect_s3(AWS_ACCESS_KEY_ID,
                               AWS_SECRET_ACCESS_KEY)
        bucket = conn.get_bucket(bucket_name)
        key = bucket.get_key('TEMP_2017-01-10.csv')
        return key.generate_url(expires_in=600)

    def get_doc():
        return Doc.objects.get(owner=User.objects.first())

    def get_file(response):
        file_temp = NamedTemporaryFile(delete=True)
        file_temp.write(response.content)
        file_temp.flush()
        return File(file_temp)

    def convert_csv_to_xlsx():
        response = requests.get(get_report_url())
        csvfile = get_file(response)
        from django.conf import settings
        excelFile = xlsxwriter.Workbook('report.xlsx', {
            'strings_to_numbers': True,
            'default_date_format': 'yy/mm/dd',
            'tmpdir': settings.MEDIA_ROOT +
                      '/documents/%s' % (csvfile.name.rsplit('.')[0] + '.xlsx'),
        })
        # This rebinds excelFile to the model's FieldFile, discarding the
        # Workbook created above -- the root of the confusion below.
        excelFile = get_doc().file
        worksheet = excelFile.add_worksheet()
        worksheet.write('A1', 'data')
        worksheet.write('B1', 'data')
        worksheet.write('C1', 'data')
        worksheet.write('D1', 'data')
        worksheet.write('E1', 'data')
        # Start from the first cell. Rows and columns are zero indexed.
        row = 1
        col = 0
        with open(csvfile.name, 'rb') as f:
            content = csv.reader(f)
            # Iterate over the data and write it out row by row.
            for row_data in content:
                for data in row_data:
                    worksheet.write(row, col, data)
                    col += 1
                row += 1
                col = 0
        excelFile.close()

    return convert_csv_to_xlsx()
Now the problem is that I really don't know how to save this Excel file to doc.file. I tried Django's FieldFile.save and got:
---> 19 read = property(lambda self: self.file.
AttributeError: 'Workbook' object has no attribute 'read'
Any suggestions? Thanks
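The error happens because FieldFile.save() expects a file-like object with a read() method (a django.core.files.File), and an xlsxwriter Workbook is not one. A minimal sketch of the fix, assuming the workbook was written to report.xlsx as in the snippet above: close the workbook first, then reopen the file it wrote and wrap it in File:

from django.core.files import File

excelFile.close()  # flush the finished .xlsx to disk before handing it to Django
with open('report.xlsx', 'rb') as f:
    get_doc().file.save('report.xlsx', File(f))

The full conversion below applies the same idea: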
import os
from datetime import datetime


def convert_csv_to_xlsx():
    csvfile = get_file(requests.get(get_report_url()))
    from django.conf import settings
    excelFile = xlsxwriter.Workbook(filename=settings.MEDIA_ROOT + '/documents%s' % (
        csvfile.name.rsplit('.')[0] + '.xlsx'))
    bold = excelFile.add_format({'bold': 1, 'align': 'left', 'bg_color': 'red', 'color': 'white'})
    worksheet = excelFile.add_worksheet()
    worksheet.set_column(0, 4, width=15)
    worksheet.write('A1', 'Sender MSISDN', bold)
    worksheet.write('B1', 'Receiver MSISDN', bold)
    worksheet.write('C1', 'Amount', bold)
    worksheet.write('D1', 'Transaction ID', bold)
    worksheet.write('E1', 'Datetime', bold)
    # Start from the first cell. Rows and columns are zero indexed.
    row = 1
    col = 0
    # Iterate over the data and write it out row by row.
    for row_data in csv.reader(csvfile):
        for idx, data in enumerate(row_data):
            if idx < 5:  # only the first five columns are written
                worksheet.write(row, col, data)
            col += 1
        row += 1
        col = 0
    csvfile.close()
    doc = get_doc()
    now = datetime.now()
    excelFile.close()
    xlsx_path = settings.MEDIA_ROOT + '/documents%s' % (csvfile.name.rsplit('.')[0] + '.xlsx')
    doc.file.save(
        name='RECHARGE_%d-%s-%s.xlsx' % (now.year,
                                         validate_date(now.month),
                                         validate_date(now.day)),
        content=File(open(xlsx_path, 'rb')),
    )
    os.remove(xlsx_path)
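As an aside, the write-to-disk-then-os.remove dance can be skipped entirely: xlsxwriter can write into an in-memory buffer, and Django can save the bytes straight into the FileField through ContentFile. A minimal sketch (the Doc model and the file field name are assumptions carried over from above):

import io

import xlsxwriter
from django.core.files.base import ContentFile


def save_workbook_to_filefield(doc, filename):
    buffer = io.BytesIO()
    workbook = xlsxwriter.Workbook(buffer, {'in_memory': True})
    worksheet = workbook.add_worksheet()
    worksheet.write('A1', 'data')
    workbook.close()  # finalizes the .xlsx into the buffer
    # ContentFile wraps raw bytes, so FieldFile.save() gets the .read() it expects
    doc.file.save(filename, ContentFile(buffer.getvalue()))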
I'm attempting to create a program that reads a CSV, determines whether a substring is present in one of the columns of each row, and, if it isn't present, writes certain columns out to a new CSV. I have that much working, but the CSV I need to run the program on has well over 3 million rows. I use PyCharm, and currently it can't process this much data; it can only view the CSV in a read-only format, which doesn't let me use it. I know pandas has a chunksize feature, but I don't know how to integrate it with the rest of my code.
def reading(csv_input):
    originalLength = 0
    rowCount = 0
    with open(f'Web Report {csv_input}', 'w') as file:
        writer = csv.writer(file)
        writer.writerow(['Index', 'URL Category', 'User IP', 'URL'])
        dropCount = 0
        # NOTE: with chunksize set, read_csv returns an iterator of chunks,
        # not a DataFrame, so the next two lines do not do what they appear to.
        data = pd.read_csv(csv_input, chunksize=100000)
        df = pd.DataFrame(data,
                          columns=['Line', 'Date', 'Hour', 'User Name', 'User IP', 'Site Name',
                                   'URL Category', 'Action', 'Action Description'])
        originalLength = len(df.index)
        for line in range(originalLength):
            dataLine = df.loc[line]
            x = dataLine.get(key='Action')
            if x == 0:
                siteName = dataLine.get(key='Site Name')
                if 'dbk' in siteName:
                    dropCount = dropCount + 1
                elif 'ptc' in siteName:
                    dropCount = dropCount + 1
                elif 'wcf' in siteName:
                    dropCount = dropCount + 1
                elif 'google' in siteName:
                    dropCount = dropCount + 1
                else:
                    writer.writerow([line,  # Original Index
                                     df.loc[line].get(key='URL Category'),  # Original URL Category
                                     df.loc[line].get(key='User IP'),  # Original User IP
                                     df.loc[line].get(key='Site Name')])  # Original Site Name
                    rowCount = rowCount + 1
            else:
                dropCount = dropCount + 1
    print("Input: " + str(csv_input))
    print("Output: " + str(file.name))
    print("Original Length: " + str(originalLength))
    print("Current Length: " + str(rowCount))
    print("Drop Count: " + str(dropCount) + "\n")
    return df
If you use csv to write the file, then you can also use it to read row by row.
import csv

with open('input.csv') as infile, open('output.csv', 'w') as outfile:
    csv_reader = csv.reader(infile)
    csv_writer = csv.writer(outfile)

    # copy headers
    headers = next(csv_reader)
    csv_writer.writerow(headers)

    # process rows
    for row in csv_reader:  # read row by row
        # keep only rows with even index
        if int(row[0]) % 2 == 0:
            print('--- row ---')
            print(row)
            csv_writer.writerow(row)
If you want to use pandas with chunks, then you need a for loop to iterate over them. And when you write with pandas, you need append mode without headers after the first chunk.
import pandas as pd

first = True

for df in pd.read_csv('input.csv', chunksize=1):  # read row by row
    # keep only rows with even index
    if df.index % 2 == 0:
        print('--- row ---')
        print(df)
        if first:
            # create new file with headers
            df.to_csv('output.csv', mode='w')
            first = False
        else:
            # append to existing file without headers
            df.to_csv('output.csv', mode='a', header=False)
Minimal working code
import pandas as pd
import csv

# --- create some data ---

data = {
    'A': range(0, 10),
    'B': range(10, 20),
    'C': range(20, 30),
}  # columns

df = pd.DataFrame(data)
df.to_csv('input.csv', index=False)

# --- read and write with `pandas` ---

first = True

for df in pd.read_csv('input.csv', chunksize=1):  # read row by row
    # keep only rows with even index
    if df.index % 2 == 0:
        print('--- row ---')
        print(df)
        if first:
            # create new file with headers
            df.to_csv('output_pandas.csv', mode='w')
            first = False
        else:
            # append to existing file without headers
            df.to_csv('output_pandas.csv', mode='a', header=False)

# --- read and write with `csv` ---

with open('input.csv') as infile, open('output.csv', 'w') as outfile:
    csv_reader = csv.reader(infile)
    csv_writer = csv.writer(outfile)

    # copy headers
    headers = next(csv_reader)
    csv_writer.writerow(headers)

    # process rows
    for row in csv_reader:
        # keep only rows with even index
        if int(row[0]) % 2 == 0:
            print('--- row ---')
            print(row)
            csv_writer.writerow(row)
Doc: read_csv(), to_csv()
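Applied to the question's filter, a chunked version might look like the sketch below. The column names, blocked substrings, and output columns are taken from the original reading(); the chunk size of 100000 is the one the question mentions:

import csv
import pandas as pd

BLOCKED = ('dbk', 'ptc', 'wcf', 'google')

def reading_chunked(csv_input):
    rowCount = dropCount = line = 0
    with open(f'Web Report {csv_input}', 'w', newline='') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(['Index', 'URL Category', 'User IP', 'URL'])
        # each iteration yields a DataFrame of up to 100000 rows,
        # so only one chunk is held in memory at a time
        for chunk in pd.read_csv(csv_input, chunksize=100000):
            for _, dataLine in chunk.iterrows():
                if dataLine['Action'] == 0 and not any(s in dataLine['Site Name'] for s in BLOCKED):
                    writer.writerow([line, dataLine['URL Category'],
                                     dataLine['User IP'], dataLine['Site Name']])
                    rowCount += 1
                else:
                    dropCount += 1
                line += 1
    print('Current Length:', rowCount)
    print('Drop Count:', dropCount)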
I am looping through CSV files and appending them to a database table, but it seems that every time I loop and append, an index column is added to the table. Very confusing, and I am very stuck; any help would be great.
My code:
import sqlite3 as sql
import pandas as pd
import hashlib
import os
import csv
from pandas import ExcelWriter

def obtain_data(filename, connect, type):
    writer = ExcelWriter('path\\new_excel_sheets\\' + filename + '.xlsx')
    table = ExcelWriter('path\\new_excel_sheets\\hash_table.xlsx')
    if type == True:
        print(filename)
        df = pd.DataFrame.from_csv('path' + filename, index_col=None)
    else:
        workbook = pd.ExcelFile('path' + filename)
        df = workbook.parse('Sheet1')
    df = df.rename(columns={'INDEX': 'INDX'})
    df = df.rename(columns={'Index': 'INDXS'})
    headers = df.dtypes.index
    header_list = str(headers.tolist())
    header_list = ''.join(header_list)
    hash_t = str(hashlib.md5(header_list.encode('utf-8')).hexdigest())
    c = connect.cursor()
    print(filename)
    print(hash_t)
    if hash_t == 'd22db04a2f009f222da57e91acdce21b':
        next_open = df['DATE'][1]
        next_open_value = df['DATE'][2]
        df.insert(3, next_open, next_open_value)
        headers = df.dtypes.index
        header_list = str(headers.tolist())
        header_list = ''.join(header_list)
        new_hash_t = str(hashlib.md5(header_list.encode('utf-8')).hexdigest())
        df = df.drop(df.index[1:])
        hashing = {str(new_hash_t): str(filename)}
        df2 = pd.DataFrame.from_dict(hashing, orient='index')
        try:
            df2.to_sql(name='Hash Table', con=connect, if_exists='append')
            df.to_sql(name=new_hash_t, con=connect, if_exists='append')
        except:
            raise IndexError('Could not transform ' + str(filename) + ' into database.')
    elif hash_t == '484fbe4de83acb41480dd935d82d7fbe':
        next_open = df['DATE'][1]
        next_open_value = df['DATE'][2]
        df.insert(3, next_open, next_open_value)
        headers = df.dtypes.index
        header_list = str(headers.tolist())
        header_list = ''.join(header_list)
        new_hash_t = str(hashlib.md5(header_list.encode('utf-8')).hexdigest())
        df = df.drop(df.index[2])
        df['DATE'][1] = df['DATE'][0]
        hashing = {new_hash_t: filename}
        df2 = pd.DataFrame.from_dict(hashing, orient='index')
        try:
            df2.to_sql(name='Hash Table', con=connect, if_exists='append')
            df.to_sql(name=new_hash_t, con=connect, if_exists='append')
        except:
            raise IndexError('Could not transform ' + str(filename) + ' into database.')
    else:
        hashing = {hash_t: filename}
        df2 = pd.DataFrame.from_dict(hashing, orient='index')
        try:
            df2.to_sql(name='Hash Table', con=connect, if_exists='append', index=False)
            df.to_sql(name=hash_t, con=connect, if_exists='append', index=True)
        except:
            raise IndexError('Could not transform ' + str(filename) + ' into database.')
    df.to_excel(writer)
    print(filename + ' has been completed successfully.')
    final_results = {'df': df, 'hash_t': hash_t}
    return final_results

csv_files = []
usable_files = []
for filename in os.listdir(filepath):
    if filename.endswith(".xlsx"):
        print('Found an XLSX file ' + str(filename))
        usable_files.append(filename)
    elif filename.endswith('.CSV'):
        print('Found a CSV File ' + filename)
        csv_files.append(filename)
    else:
        print('Found an unusable file ' + str(filename))

for file in usable_files:
    connect = sql.connect(SQLite3 connection)
    obtain_data(file, connect, False)
for file in csv_files:
    connect = sql.connect(SQLite3 connection)
    obtain_data(file, connect, True)
print('All files have been made into Tables')
The SQLite3 database does everything right, but when I append to it, it adds an index column. I am not sure how to format tables here (feel free to teach me), so bear with me. The table goes from looking like this
rowid, 0 , 1, 2, etc
0, value, value, value, etc
1, value, value, value, etc
but when I loop through (say, 4 times), it changes to this
rowid, index, 0, 1, 2, etc
0, 0, 0, 0, 0, value
0, 0, 0, 0, 0, value
This is a very weird problem so any help would be appreciated, thanks!
Simply set the index parameter to False in all to_sql() calls (by default it is set to True):
df2.to_sql(name='Hash Table', con=connect, if_exists='append', index=False)
And do the same for any flat-file outputs:
df.to_excel(writer, index=False)
df.to_csv(filename, index=False)
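A quick way to see the difference, as a self-contained sketch using an in-memory SQLite database:

import sqlite3
import pandas as pd

connect = sqlite3.connect(':memory:')
df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})

df.to_sql(name='with_index', con=connect, if_exists='append')  # index=True by default
df.to_sql(name='without_index', con=connect, if_exists='append', index=False)

print(pd.read_sql('SELECT * FROM with_index', connect).columns.tolist())
# ['index', 'A', 'B']  <- the extra column that keeps appearing
print(pd.read_sql('SELECT * FROM without_index', connect).columns.tolist())
# ['A', 'B']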
I'm reading data from one file, named SPD_file, and matching it against another file, named Custom. All records that match in both files should be written into a third file.
But something seems to be wrong: the code matches the records and prints them to the console, but when I write to the new file, nothing appears in it other than the header.
workbook = xlrd.open_workbook(SPD_file)
worksheets = workbook.sheet_names()
mapping_records = {}
for worksheet_name in worksheets:
    worksheet = workbook.sheet_by_name(worksheet_name)
    mapping_record = MappingRecord()
    if worksheet_name == "CD":
        for curr_row in range(0, worksheet.nrows):
            mapping_record = worksheet.row(curr_row)
            print worksheet_name
            print mapping_record[0].value
            for curr_row in mapping_record:
                # print "In Loop...."
                spd_record = MappingRecord()
                spd_record.id = "00002269"
                spd_record.erocode = None
                spd_record.scno = None
                mapping_records[mapping_record[8]] = spd_record
print "Read SPD File....."

custom_file_name = "Custom_" + today.strftime('%Y-%m-%d') + ".csv"
custom_file = ops_home + path + "\\" + custom_file_name
custom = open(custom_file, 'rb')
reader = csv.reader(custom, delimiter=',', quotechar='"')
for line in reader:
    if mapping_records.has_key(mapping_record[8]):
        spd_record = mapping_records[mapping_record[8]]
        if line[7] == "ERO Code":
            spd_record.erocode = line[8]
        elif line[7] == "Service Number":
            spd_record.scno = line[8]

# create a new file.
New_file = ops_home + '\\Reports\\SPD_new_' + today.strftime('%d%m%Y') + '.xlsx'
workbook = xlsxwriter.Workbook(New_file)

# Add a bold format to use to highlight cells.
bold = workbook.add_format({'bold': 1})
money = workbook.add_format({'num_format': '#,##0.00'})

worksheetCd = workbook.add_worksheet("CD")
cdHeader = ("Merchant ID", "EroCode", "Service Number")
cd_row = 0
cd_col = 0
for columnHeader in cdHeader:
    worksheetCd.write(cd_row, cd_col, columnHeader, bold)
    cd_col += 1

for ctx in mapping_records:
    spd_record = mapping_records[ctx]
    if spd_record.payment_mode == "CRD":
        cd_row += 1
        cd_col = 0
        cdRow = (spd_record.id, spd_record.erocode, spd_record.scno)
        for columnData in cdRow:
            if cd_col == 5 or cd_col == 19 or cd_col == 20 or cd_col == 21:
                worksheetCd.write_number(cd_row, cd_col, columnData, money)
            else:
                worksheetCd.write(cd_row, cd_col, columnData)
            cd_col += 1
workbook.close()
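One likely culprit: the CSV loop tests mapping_records.has_key(mapping_record[8]), reusing whatever mapping_record was left over from the xlrd loop on every line, rather than keying on a field of the current line. A sketch of what the lookup was presumably meant to be (which column of line holds the key is an assumption):

for line in reader:
    key = line[0]  # assumption: the first CSV column carries the matching key
    if key in mapping_records:
        spd_record = mapping_records[key]
        if line[7] == "ERO Code":
            spd_record.erocode = line[8]
        elif line[7] == "Service Number":
            spd_record.scno = line[8]

Note also that spd_record.payment_mode is never assigned in the snippet shown, so unless MappingRecord sets it in __init__, the == "CRD" filter in the write loop may be skipping every record, which would leave only the header in the output.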
I am trying to write some output to CSV from my code below. The first column should have all of the valid IDs, with a header that says “Valid (count in parentheses)”. The second column should contain a list of all of the non-valid IDs, with a header that says “Non-valid (count in parentheses)”. Any idea how I do this?
import csv

# csv_path = r'C:\temp\data\fileA'
csv_path = r'C:\temp\data\fileA'
reader = csv.reader(open(csv_path, 'r'), dialect='excel-tab')
reader.next()  # ignore heading

min_id = 1503332138
max_id = 1503632138

valid_ids = []
invalid = []
for line in reader:
    id = line[1]
    if id.isdigit() and min_id <= int(id) <= max_id:
        if id not in valid_ids:
            valid_ids.append(id)
    else:
        if id not in invalid:
            invalid.append(id)

print 'Valid IDs (', len(valid_ids), ')'
for valid in valid_ids:
    print valid
print 'Invalid IDs (', len(invalid), ')'
for inv in invalid:
    print inv
# ...
# Continuing from the point where you have the valid_ids and invalid lists populated

data = [('Valid IDs', valid_ids), ('Invalid IDs', invalid)]

# Create header
header = []
for (label, id_list) in data:
    label_with_count = '%s (%d)' % (label, len(id_list))
    header.append(label_with_count)

# Write to CSV file
with open('path_to_output_file.csv', 'wb') as out_csv_file:
    csv_writer = csv.writer(out_csv_file)
    csv_writer.writerow(header)
    for (idx, dataset) in enumerate(data):
        (label, id_list) = dataset
        for id in id_list:
            # pad with empty cells so each id lands in its own column
            row = (idx * ['']) + [id] + ((len(data) - idx - 1) * [''])
            csv_writer.writerow(row)
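This writes the two groups one after the other, each group in its own column. If you'd rather fill both columns side by side row-for-row, you could zip the lists together and pad the shorter one, as in this sketch (izip_longest is the Python 2 spelling; in Python 3 it is itertools.zip_longest):

from itertools import izip_longest  # itertools.zip_longest in Python 3

with open('path_to_output_file.csv', 'wb') as out_csv_file:
    csv_writer = csv.writer(out_csv_file)
    csv_writer.writerow(['Valid IDs (%d)' % len(valid_ids),
                         'Invalid IDs (%d)' % len(invalid)])
    # pair the two lists row by row, padding the shorter one with ''
    for valid_id, invalid_id in izip_longest(valid_ids, invalid, fillvalue=''):
        csv_writer.writerow([valid_id, invalid_id])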
So, I have a QTableWidget that I want to save to an .xls file using the xlwt module...
Here's the code:
def savefile(self):
    filename = unicode(QtGui.QFileDialog.getSaveFileName(self, 'Save File', '', ".xls(*.xls)"))
    wbk = xlwt.Workbook()
    self.sheet = wbk.add_sheet("sheet")
    self.row = 0
    self.col = 0
    self.add2(self.row, self.col)
    wbk.save(filename)

def add2(self, row, col):
    for i in range(self.tableWidget.columnCount()):
        for x in range(self.tableWidget.rowCount()):
            try:
                teext = str(self.tableWidget.item(row, col).text())
                self.sheet.write(row, col, teext)
                row += 1
            except AttributeError:
                pass
        col += 1
But that writes out only the text from cell (0, 0) and nothing else...
I think I have made some serious mistake...
Update:
def savefile(self):
    filename = unicode(QtGui.QFileDialog.getSaveFileName(self, 'Save File', '', ".xls(*.xls)"))
    wbk = xlwt.Workbook()
    self.sheet = wbk.add_sheet("sheet", cell_overwrite_ok=True)
    self.add2()
    wbk.save(filename)

def add2(self):
    row = 0
    col = 0
    for i in range(self.tableWidget.columnCount()):
        for x in range(self.tableWidget.rowCount()):
            try:
                teext = str(self.tableWidget.item(row, col).text())
                self.sheet.write(row, col, teext)
                row += 1
            except AttributeError:
                row += 1
        row = 0
        col += 1
Solved the problem...
You might also find it more concise to use the output of range (or xrange) as the indexes for your tableWidget.item call, rather than incrementing your own counters. And while you might be using the sheet itself elsewhere in your code, if you're not, it would save some memory not to assign the sheet as an attribute of your class:
def savefile(self):
    filename = unicode(QtGui.QFileDialog.getSaveFileName(self, 'Save File', '', ".xls(*.xls)"))
    wbk = xlwt.Workbook()
    sheet = wbk.add_sheet("sheet", cell_overwrite_ok=True)
    self.add2(sheet)
    wbk.save(filename)

def add2(self, sheet):
    for currentColumn in range(self.tableWidget.columnCount()):
        for currentRow in range(self.tableWidget.rowCount()):
            try:
                teext = str(self.tableWidget.item(currentRow, currentColumn).text())
                sheet.write(currentRow, currentColumn, teext)
            except AttributeError:
                pass
Because you are using the range command, the currentColumn variable will increment from 0 up to columnCount(), and currentRow will increment from 0 up to rowCount().