Add CR & LF in all lines in Python

First of all, thanks for reading my post. I hope you can help me; I'm really new to Python, so sorry if the answer is really easy.
I read several posts about adding [CR][LF] to all lines, but the main issue with my script (which I didn't write) is that it needs to put [CR][LF] at the end of every line.
At the moment the script only adds [LF], not the [CR]. The script goes to SQL Server to extract some tables, converts the data to CSV (at this point the output keeps [CR][LF]), and after that converts it to JSON (this step only gives me [LF]).
import pyodbc
import fileinput
import csv
import pandas as pd
import json
import os
import sys

conn = pyodbc.connect('Driver={SQL Server};'
                      'Server=TEST;'
                      'UID=test;'
                      'PWD=12345;'
                      'Database=TEST;'
                      'Trusted_Connection=no;')
cursor = conn.cursor()

query = "SELECT * FROM placeholder"

with open(r"D:\Test.txt") as file:
    lines = file.readlines()
    print(lines)

for user_input in lines:
    result = query.replace("placeholder", user_input)
    print(result)
    sql_query = pd.read_sql(result, conn)
    df = pd.DataFrame(sql_query)
    user_inputs = user_input.strip("\n")
    filename = os.path.join('D:\\', user_inputs + '.csv')
    df.to_csv(filename, index=False, encoding='utf-8', sep='~', quotechar="`", quoting=csv.QUOTE_ALL)
    print(filename)
    filename_json = os.path.join('D:\\', user_inputs + '.jsonl')
    csvFilePath = filename
    jsonFilePath = filename_json
    print(filename_json)
    df_o = df.astype(str)
    df_o = df_o.applymap(lambda x: x.strip() if isinstance(x, str) else x)
    df_o.to_json(filename_json, orient="records", lines=bool, date_format="iso", double_precision=15, force_ascii=False, date_unit='ms', default_handler=str)

dir_name = "D:\\"
test = os.listdir(dir_name)
for item in test:
    if item.endswith(".csv"):
        os.remove(os.path.join(dir_name, item))

cursor.close()
conn.close()
So I don't know where I need to add this instruction.
Again, thanks so much to all of you who are always helping me!
Kind regards.

pandas.DataFrame.to_json uses the newline rules of the underlying file object when writing records. If you pass in a file name, pandas will open the file in the default "\n" newline mode. Alternatively, you can open the file yourself and choose the newline policy you want.
import pandas as pd
df = pd.DataFrame([[1,2,3],[4,5,6],[7,8,9]])
df.to_json(open("test.json", "w", newline="\r\n"), orient="records", lines=True)
print(open("test.json", "rb").read())
Output
b'{"0":1,"1":2,"2":3}\r\n{"0":4,"1":5,"2":6}\r\n{"0":7,"1":8,"2":9}'
(Note also that lines should be True or False, not bool; the bool type happens to be "truthy", so it works, but it is not correct.)
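Applied to the script in the question, that means opening the .jsonl file yourself before handing it to to_json. A minimal sketch, reusing df_o and filename_json from the loop above:

with open(filename_json, "w", newline="\r\n", encoding="utf-8") as fh:
    # every "\n" that to_json emits is translated to "\r\n" by the file object
    df_o.to_json(fh, orient="records", lines=True, date_format="iso",
                 double_precision=15, force_ascii=False, date_unit="ms",
                 default_handler=str)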


Full tag in to_xml pandas in any case

I have code like this:
data_to_string = data_from_table.to_xml(path_or_buffer=file_name + ".xml", index=False,
                                        root_name="convoy", row_name="vehicle", xml_declaration=False)
It works fine with files containing several rows, but if there is nothing to export, I get only
<convoy/>
in my .xml.
How can I fix the code to get:
<convoy>
</convoy>
in all cases?
This behavior is hard-coded into the XML library.
Your requirement can be fulfilled by checking whether there are rows in your DataFrame and, if not, creating an empty HTML-style string (here and here) instead and writing that to the output.
Be aware that <convoy/> and <convoy></convoy> are equivalent in XML.
Here is a fully working example:
import pandas as pd
from lxml import etree

file_name = "test"
path = file_name + ".xml"
root_name = "convoy"
row_name = "vehicle"

data_from_table = pd.DataFrame()

if len(data_from_table.index) > 0:
    data_to_string = data_from_table.to_xml(
        path_or_buffer=path,
        index=False,
        root_name=root_name,
        row_name=row_name,
        xml_declaration=False,
    )
else:
    elem = etree.Element(root_name)
    empty_xml = etree.tostring(elem, method="html", encoding="unicode")
    with open(path, "w") as f:
        f.write(empty_xml)

Handle big files with python & pandas

Thanks for reading my post.
I need to deal with big files. Let me give you more context: I extract some tables from a database, convert those tables to CSV, and after that convert them to JSON.
All of that is to send the information to BigQuery.
Now my script works fine, but I have a problem: some of the tables I extract are very big. One of them is 14 GB, and my server only has 8 GB of memory. Is there any way to change my script so it splits or appends the information?
My script:
import pyodbc
import fileinput
import csv
import pandas as pd
import json
import os
import sys

conn = pyodbc.connect("Driver={SQL Server};"
                      "Server=TEST;"
                      "username=test;"
                      "password=12345;"
                      "Database=TEST;"
                      "Trusted_Connection=no;")
cursor = conn.cursor()

query = "SELECT * FROM placeholder where "

with open(r"D:\Test.txt") as file:
    lines = file.readlines()
    print(lines)

for user_input in lines:
    result = query.replace("placeholder", user_input)
    print(result)
    sql_query = pd.read_sql(result, conn)
    df = pd.DataFrame(sql_query)
    user_inputs = user_input.strip("\n")
    filename = os.path.join('D:\\', user_inputs + '.csv')
    df.to_csv(filename, index=False, encoding='utf-8', sep='~', quotechar="`", quoting=csv.QUOTE_ALL)
    print(filename)
    filename_json = os.path.join('D:\\', user_inputs + '.jsonl')
    csvFilePath = filename
    jsonFilePath = filename_json
    print(filename_json)
    df_o = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
    df_o.to_json(filename_json, orient="records", lines=True, date_format="iso", double_precision=15, force_ascii=False, date_unit='ms', default_handler=str)

dir_name = "D:\\"
test = os.listdir(dir_name)
for item in test:
    if item.endswith(".csv"):
        os.remove(os.path.join(dir_name, item))

cursor.close()
conn.close()
I'm really new to Python; I hope you can help me integrate something like that into my script.
Really, thanks so much guys!
Kind regards.
For large data sets you should avoid reading all of the data at once and then writing it all at once. You should do partial reads and partial writes.
Since you are using BigQuery, you should use partitions to limit the query output. Have some logic to update the partition offsets, and generate one file per partition. In this case your output would look like output-1.csv, output-2.csv, etc.
An example of using a partition:
SELECT * FROM placeholder
WHERE transaction_date >= '2016-01-01'
As a bonus tip, avoid doing SELECT *; since BigQuery is a columnar storage system, mentioning only the columns you want to read will significantly improve the performance.
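On the Python side of the same idea, pandas can do the partial reads for you: read_sql accepts a chunksize, so no table is ever held in memory whole. A minimal sketch reusing result, conn, and filename_json from the question (the chunk size of 50000 is only illustrative):

import pandas as pd

with open(filename_json, "w", encoding="utf-8") as out:
    # read_sql with chunksize yields DataFrames of at most 50000 rows each
    for chunk in pd.read_sql(result, conn, chunksize=50000):
        chunk = chunk.applymap(lambda x: x.strip() if isinstance(x, str) else x)
        text = chunk.to_json(orient="records", lines=True,
                             date_format="iso", force_ascii=False)
        if not text.endswith("\n"):  # pandas versions differ on the trailing newline
            text += "\n"
        out.write(text)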

Is there Python code to write directly into a SQLite command line? [duplicate]

I have a CSV file and I want to bulk-import this file into my sqlite3 database using Python. The command is ".import .....", but it seems that it cannot work like this. Can anyone give me an example of how to do it in sqlite3? I am using Windows, just in case.
Thanks
import csv, sqlite3

con = sqlite3.connect(":memory:")  # use a filename such as 'your_filename.db' for a database on disk
cur = con.cursor()
cur.execute("CREATE TABLE t (col1, col2);")  # use your column names here

with open('data.csv', 'r') as fin:  # `with` statement available in 2.5+
    # csv.DictReader uses first line in file for column headings by default
    dr = csv.DictReader(fin)  # comma is default delimiter
    to_db = [(i['col1'], i['col2']) for i in dr]

cur.executemany("INSERT INTO t (col1, col2) VALUES (?, ?);", to_db)
con.commit()
con.close()
Creating an sqlite connection to a file on disk is left as an exercise for the reader ... but there is now a two-liner made possible by the pandas library
df = pandas.read_csv(csvfile)
df.to_sql(table_name, conn, if_exists='append', index=False)
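For completeness, a self-contained sketch of that two-liner with the disk connection filled in (the file and table names here are placeholders):

import sqlite3
import pandas as pd

conn = sqlite3.connect('my.db')  # created on disk if it does not exist
df = pd.read_csv('data.csv')
df.to_sql('mytable', conn, if_exists='append', index=False)
conn.close()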
You're right that .import is the way to go, but that's a command from the SQLite3 command-line program. A lot of the top answers to this question involve native Python loops, but if your files are large (mine are 10^6 to 10^7 records), you want to avoid reading everything into pandas or using a native Python list comprehension/loop (though I did not time them for comparison).
For large files, I believe the best option is to use subprocess.run() to execute sqlite3's .import command. In the example below, I assume the table already exists, but the csv file has headers in the first row. See the .import docs for more info.
import subprocess
from pathlib import Path

db_name = Path('my.db').resolve()
csv_file = Path('file.csv').resolve()

result = subprocess.run(['sqlite3',
                         str(db_name),
                         '-cmd',
                         '.mode csv',
                         '.import --skip 1 ' + str(csv_file).replace('\\', '\\\\')
                         + ' <table_name>'],
                        capture_output=True)
Edit note: sqlite3's .import command has improved so that it can treat the first row as header names or even skip the first x rows (this requires version >= 3.32, as noted in this answer). If you have an older version of sqlite3, you may need to first create the table, then strip off the first row of the csv before importing. The --skip 1 argument will give an error prior to 3.32.
Explanation
From the command line, the command you're looking for is sqlite3 my.db -cmd ".mode csv" ".import file.csv table". subprocess.run() runs a command-line process. The argument to subprocess.run() is a sequence of strings which are interpreted as a command followed by all of its arguments.
sqlite3 my.db opens the database
the -cmd flag after the database allows you to pass multiple follow-on commands to the sqlite3 program. In the shell, each command has to be in quotes, but here they just need to be their own element of the sequence
'.mode csv' does what you'd expect
'.import --skip 1 ' + str(csv_file).replace('\\', '\\\\') + ' <table_name>' is the import command.
Unfortunately, since subprocess passes all follow-ons to -cmd as quoted strings, you need to double up your backslashes if you have a Windows directory path.
Stripping Headers
Not really the main point of the question, but here's what I used. Again, I didn't want to read the whole files into memory at any point:
import shutil

with open(csv, "r") as source:
    source.readline()  # consume the header line
    with open(str(csv) + "_nohead", "w") as target:
        shutil.copyfileobj(source, target)
My 2 cents (more generic):
import csv, sqlite3
import logging


def _get_col_datatypes(fin):
    dr = csv.DictReader(fin)  # comma is default delimiter
    fieldTypes = {}
    for entry in dr:
        feildslLeft = [f for f in dr.fieldnames if f not in fieldTypes.keys()]
        if not feildslLeft:
            break  # We're done
        for field in feildslLeft:
            data = entry[field]
            # Need data to decide
            if len(data) == 0:
                continue
            if data.isdigit():
                fieldTypes[field] = "INTEGER"
            else:
                fieldTypes[field] = "TEXT"
            # TODO: Currently there's no support for DATE in sqlite
    if len(feildslLeft) > 0:
        raise Exception("Failed to find all the columns data types - Maybe some are empty?")
    return fieldTypes


def escapingGenerator(f):
    for line in f:
        yield line.encode("ascii", "xmlcharrefreplace").decode("ascii")


def csvToDb(csvFile, outputToFile=False):
    # TODO: implement output to file
    with open(csvFile, mode='r', encoding="ISO-8859-1") as fin:
        dt = _get_col_datatypes(fin)
        fin.seek(0)
        reader = csv.DictReader(fin)
        # Keep the order of the columns name just as in the CSV
        fields = reader.fieldnames
        cols = []
        # Set field and type
        for f in fields:
            cols.append("%s %s" % (f, dt[f]))
        # Generate create table statement:
        stmt = "CREATE TABLE ads (%s)" % ",".join(cols)
        con = sqlite3.connect(":memory:")
        cur = con.cursor()
        cur.execute(stmt)
        fin.seek(0)
        reader = csv.reader(escapingGenerator(fin))
        # Generate insert statement:
        stmt = "INSERT INTO ads VALUES(%s);" % ','.join('?' * len(cols))
        cur.executemany(stmt, reader)
        con.commit()
    return con
The .import command is a feature of the sqlite3 command-line tool. To do it in Python, you should simply load the data using whatever facilities Python has, such as the csv module, and insert the data as usual.
This way, you also have control over what types are inserted, rather than relying on sqlite3's seemingly undocumented behaviour.
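For instance, a minimal sketch of that approach with explicit type control (the table and column names here are hypothetical):

import csv, sqlite3

con = sqlite3.connect("my.db")
con.execute("CREATE TABLE IF NOT EXISTS items (id INTEGER, name TEXT, price REAL)")
with open("data.csv", newline="") as fin:
    # cast each field ourselves instead of letting sqlite3 guess
    rows = ((int(r["id"]), r["name"], float(r["price"]))
            for r in csv.DictReader(fin))
    con.executemany("INSERT INTO items VALUES (?, ?, ?)", rows)
con.commit()
con.close()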
Many thanks for bernie's answer! Had to tweak it a bit; here's what worked for me:
import csv, sqlite3

# Python 2 code (uses the unicode() built-in)
conn = sqlite3.connect("pcfc.sl3")
curs = conn.cursor()
curs.execute("CREATE TABLE PCFC (id INTEGER PRIMARY KEY, type INTEGER, term TEXT, definition TEXT);")
reader = csv.reader(open('PC.txt', 'r'), delimiter='|')
for row in reader:
    to_db = [unicode(row[0], "utf8"), unicode(row[1], "utf8"), unicode(row[2], "utf8")]
    curs.execute("INSERT INTO PCFC (type, term, definition) VALUES (?, ?, ?);", to_db)
conn.commit()
My text file (PC.txt) looks like this:
1 | Term 1 | Definition 1
2 | Term 2 | Definition 2
3 | Term 3 | Definition 3
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Python 2 script (uses unicode() and opens the CSV in "rb" mode)
import sys, csv, sqlite3


def main():
    con = sqlite3.connect(sys.argv[1])  # database file input
    cur = con.cursor()
    cur.executescript("""
        DROP TABLE IF EXISTS t;
        CREATE TABLE t (COL1 TEXT, COL2 TEXT);
        """)  # drops the table if it exists and makes a fresh one
    with open(sys.argv[2], "rb") as f:  # CSV file input
        reader = csv.reader(f, delimiter=',')  # no header information with delimiter
        for row in reader:
            to_db = [unicode(row[0], "utf8"), unicode(row[1], "utf8")]  # decode the CSV text fields
            cur.execute("INSERT INTO t (COL1, COL2) VALUES(?, ?);", to_db)
    con.commit()
    con.close()  # closes connection to database


if __name__ == '__main__':
    main()
"""
cd Final_Codes
python csv_to_db.py
CSV to SQL DB
"""
import csv
import sqlite3
import os
import fnmatch
UP_FOLDER = os.path.dirname(os.getcwd())
DATABASE_FOLDER = os.path.join(UP_FOLDER, "Databases")
DBNAME = "allCompanies_database.db"
def getBaseNameNoExt(givenPath):
"""Returns the basename of the file without the extension"""
filename = os.path.splitext(os.path.basename(givenPath))[0]
return filename
def find(pattern, path):
"""Utility to find files wrt a regex search"""
result = []
for root, dirs, files in os.walk(path):
for name in files:
if fnmatch.fnmatch(name, pattern):
result.append(os.path.join(root, name))
return result
if __name__ == "__main__":
Database_Path = os.path.join(DATABASE_FOLDER, DBNAME)
# change to 'sqlite:///your_filename.db'
csv_files = find('*.csv', DATABASE_FOLDER)
con = sqlite3.connect(Database_Path)
cur = con.cursor()
for each in csv_files:
with open(each, 'r') as fin: # `with` statement available in 2.5+
# csv.DictReader uses first line in file for column headings by default
dr = csv.DictReader(fin) # comma is default delimiter
TABLE_NAME = getBaseNameNoExt(each)
Cols = dr.fieldnames
numCols = len(Cols)
"""
for i in dr:
print(i.values())
"""
to_db = [tuple(i.values()) for i in dr]
print(TABLE_NAME)
# use your column names here
ColString = ','.join(Cols)
QuestionMarks = ["?"] * numCols
ToAdd = ','.join(QuestionMarks)
cur.execute(f"CREATE TABLE {TABLE_NAME} ({ColString});")
cur.executemany(
f"INSERT INTO {TABLE_NAME} ({ColString}) VALUES ({ToAdd});", to_db)
con.commit()
con.close()
print("Execution Complete!")
This should come in handy when you have a lot of csv files in a folder which you wish to convert to a single .db file in one go!
Notice that you don't have to know the filenames, table names, or field names (column names) beforehand!
If the CSV file must be imported as part of a python program, then for simplicity and efficiency, you could use os.system along the lines suggested by the following:
import os
cmd = """sqlite3 database.db <<< ".import input.csv mytable" """
rc = os.system(cmd)
print(rc)
The point is that by specifying the filename of the database, the data will automatically be saved, assuming there are no errors reading it.
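Note that <<< is a bash here-string, so the os.system call above won't work from Windows cmd. A shell-independent sketch that feeds the same dot-commands to sqlite3 on stdin (same placeholder paths and table name):

import subprocess

commands = '.mode csv\n.import input.csv mytable\n'
# sqlite3 reads dot-commands from stdin when it is not a terminal
subprocess.run(['sqlite3', 'database.db'], input=commands,
               text=True, check=True)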
Here are solutions that'll work if your CSV file is really big. Use to_sql as suggested by another answer, but set chunksize so it doesn't try to process the whole file at once.
import sqlite3
import pandas as pd

conn = sqlite3.connect('my_data.db')
c = conn.cursor()
users = pd.read_csv('users.csv')
users.to_sql('users', conn, if_exists='append', index=False, chunksize=10000)
You can also use Dask, as described here, to write a lot of Pandas DataFrames in parallel:
import dask
import pandas as pd

# db_url and ddf (a Dask DataFrame) come from the linked example
dto_sql = dask.delayed(pd.DataFrame.to_sql)
out = [dto_sql(d, 'table_name', db_url, if_exists='append', index=True)
       for d in ddf.to_delayed()]
dask.compute(*out)
See here for more details.
Based on Guy L's solution (love it), but able to handle escaped fields.
import csv, sqlite3


def _get_col_datatypes(fin):
    dr = csv.DictReader(fin)  # comma is default delimiter
    fieldTypes = {}
    for entry in dr:
        feildslLeft = [f for f in dr.fieldnames if f not in fieldTypes.keys()]
        if not feildslLeft:
            break  # We're done
        for field in feildslLeft:
            data = entry[field]
            # Need data to decide
            if len(data) == 0:
                continue
            if data.isdigit():
                fieldTypes[field] = "INTEGER"
            else:
                fieldTypes[field] = "TEXT"
            # TODO: Currently there's no support for DATE in sqlite
    if len(feildslLeft) > 0:
        raise Exception("Failed to find all the columns data types - Maybe some are empty?")
    return fieldTypes


def escapingGenerator(f):
    for line in f:
        yield line.encode("ascii", "xmlcharrefreplace").decode("ascii")


def csvToDb(csvFile, dbFile, tablename, outputToFile=False):
    # TODO: implement output to file
    with open(csvFile, mode='r', encoding="ISO-8859-1") as fin:
        dt = _get_col_datatypes(fin)
        fin.seek(0)
        reader = csv.DictReader(fin)
        # Keep the order of the columns name just as in the CSV
        fields = reader.fieldnames
        cols = []
        # Set field and type
        for f in fields:
            cols.append("\"%s\" %s" % (f, dt[f]))
        # Generate create table statement:
        stmt = "create table if not exists \"" + tablename + "\" (%s)" % ",".join(cols)
        print(stmt)
        con = sqlite3.connect(dbFile)
        cur = con.cursor()
        cur.execute(stmt)
        fin.seek(0)
        reader = csv.reader(escapingGenerator(fin))
        # Generate insert statement:
        stmt = "INSERT INTO \"" + tablename + "\" VALUES(%s);" % ','.join('?' * len(cols))
        cur.executemany(stmt, reader)
        con.commit()
    con.close()
You can do this using blaze & odo efficiently:
import blaze as bz
csv_path = 'data.csv'
bz.odo(csv_path, 'sqlite:///data.db::data')
Odo will store the csv file to data.db (an sqlite database) under the schema data.
Or you can use odo directly, without blaze. Either way is fine. Read this documentation.
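A sketch of that direct odo route, with the same source and target as the blaze example:

from odo import odo

# source on the left, target URI on the right; ::data names the table
odo('data.csv', 'sqlite:///data.db::data')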
The following can also add field names based on the CSV header:
import sqlite3


def csv_sql(file_dir, table_name, database_name):
    con = sqlite3.connect(database_name)
    cur = con.cursor()
    # Drop the current table with:
    # cur.execute("DROP TABLE IF EXISTS %s;" % table_name)
    with open(file_dir, 'r') as fl:
        hd = fl.readline()[:-1].split(',')
        ro = fl.readlines()
        db = [tuple(ro[i][:-1].split(',')) for i in range(len(ro))]
    header = ','.join(hd)
    cur.execute("CREATE TABLE IF NOT EXISTS %s (%s);" % (table_name, header))
    cur.executemany("INSERT INTO %s (%s) VALUES (%s);" % (table_name, header, ('?,' * len(hd))[:-1]), db)
    con.commit()
    con.close()


# Example:
csv_sql('./surveys.csv', 'survey', 'eco.db')
In the interest of simplicity, you could use the sqlite3 command-line tool from the Makefile of your project.
%.sql3: %.csv
	rm -f $@
	sqlite3 $@ -echo -cmd ".mode csv" ".import $< $*"
%.dump: %.sql3
	sqlite3 $< "select * from $*"
make test.sql3 then creates the sqlite database from an existing test.csv file, with a single table "test". You can then make test.dump to verify the contents.
With this you can do joins on CSVs as well:
import sqlite3
import os
import pandas as pd
from typing import List


class CSVDriver:
    def __init__(self, table_dir_path: str):
        self.table_dir_path = table_dir_path  # where tables (i.e. csv files) are located
        self._con = None

    @property
    def con(self) -> sqlite3.Connection:
        """Make a singleton connection to an in-memory SQLite database"""
        if not self._con:
            self._con = sqlite3.connect(":memory:")
        return self._con

    def _exists(self, table: str) -> bool:
        query = """
        SELECT name
        FROM sqlite_master
        WHERE type ='table'
        AND name NOT LIKE 'sqlite_%';
        """
        tables = self.con.execute(query).fetchall()
        return table in [t[0] for t in tables]  # fetchall returns 1-tuples of names

    def _load_table_to_mem(self, table: str, sep: str = None) -> None:
        """
        Load a CSV into an in-memory SQLite database
        sep is set to None in order to force pandas to auto-detect the delimiter
        """
        if self._exists(table):
            return
        file_name = table + ".csv"
        path = os.path.join(self.table_dir_path, file_name)
        if not os.path.exists(path):
            raise ValueError(f"CSV table {table} does not exist in {self.table_dir_path}")
        df = pd.read_csv(path, sep=sep, engine="python")  # set engine to python to skip pandas' warning
        df.to_sql(table, self.con, if_exists='replace', index=False, chunksize=10000)

    def query(self, query: str) -> List[tuple]:
        """
        Run an SQL query on CSV file(s).
        Tables are loaded from table_dir_path
        """
        tables = extract_tables(query)
        for table in tables:
            self._load_table_to_mem(table)
        cursor = self.con.cursor()
        cursor.execute(query)
        records = cursor.fetchall()
        return records
extract_tables():
import sqlparse
from sqlparse.sql import IdentifierList, Identifier, Function
from sqlparse.tokens import Keyword, DML
from collections import namedtuple
import itertools


class Reference(namedtuple('Reference', ['schema', 'name', 'alias', 'is_function'])):
    __slots__ = ()

    def has_alias(self):
        return self.alias is not None

    @property
    def is_query_alias(self):
        return self.name is None and self.alias is not None

    @property
    def is_table_alias(self):
        return self.name is not None and self.alias is not None and not self.is_function

    @property
    def full_name(self):
        if self.schema is None:
            return self.name
        else:
            return self.schema + '.' + self.name


def _is_subselect(parsed):
    if not parsed.is_group:
        return False
    for item in parsed.tokens:
        if item.ttype is DML and item.value.upper() in ('SELECT', 'INSERT',
                                                        'UPDATE', 'CREATE', 'DELETE'):
            return True
    return False


def _identifier_is_function(identifier):
    return any(isinstance(t, Function) for t in identifier.tokens)


def _extract_from_part(parsed):
    tbl_prefix_seen = False
    for item in parsed.tokens:
        if item.is_group:
            for x in _extract_from_part(item):
                yield x
        if tbl_prefix_seen:
            if _is_subselect(item):
                for x in _extract_from_part(item):
                    yield x
            # An incomplete nested select won't be recognized correctly as a
            # sub-select. eg: 'SELECT * FROM (SELECT id FROM user'. This causes
            # the second FROM to trigger this elif condition resulting in a
            # StopIteration. So we need to ignore the keyword FROM.
            # Also 'SELECT * FROM abc JOIN def' will trigger this elif
            # condition. So we need to ignore the keyword JOIN and its variants
            # INNER JOIN, FULL OUTER JOIN, etc.
            elif item.ttype is Keyword and (
                    not item.value.upper() == 'FROM') and (
                    not item.value.upper().endswith('JOIN')):
                tbl_prefix_seen = False
            else:
                yield item
        elif item.ttype is Keyword or item.ttype is Keyword.DML:
            item_val = item.value.upper()
            if (item_val in ('COPY', 'FROM', 'INTO', 'UPDATE', 'TABLE') or
                    item_val.endswith('JOIN')):
                tbl_prefix_seen = True
        # 'SELECT a, FROM abc' will detect FROM as part of the column list.
        # So this check here is necessary.
        elif isinstance(item, IdentifierList):
            for identifier in item.get_identifiers():
                if (identifier.ttype is Keyword and
                        identifier.value.upper() == 'FROM'):
                    tbl_prefix_seen = True
                    break


def _extract_table_identifiers(token_stream):
    for item in token_stream:
        if isinstance(item, IdentifierList):
            for ident in item.get_identifiers():
                try:
                    alias = ident.get_alias()
                    schema_name = ident.get_parent_name()
                    real_name = ident.get_real_name()
                except AttributeError:
                    continue
                if real_name:
                    yield Reference(schema_name, real_name,
                                    alias, _identifier_is_function(ident))
        elif isinstance(item, Identifier):
            yield Reference(item.get_parent_name(), item.get_real_name(),
                            item.get_alias(), _identifier_is_function(item))
        elif isinstance(item, Function):
            yield Reference(item.get_parent_name(), item.get_real_name(),
                            item.get_alias(), _identifier_is_function(item))


def extract_tables(sql):
    # let's handle multiple statements in one sql string
    extracted_tables = []
    statements = list(sqlparse.parse(sql))
    for statement in statements:
        stream = _extract_from_part(statement)
        extracted_tables.append([ref.name for ref in _extract_table_identifiers(stream)])
    return list(itertools.chain(*extracted_tables))
Example (assuming account.csv and tojoin.csv exist in /path/to/files):
db_path = r"/path/to/files"
driver = CSVDriver(db_path)
query = """
SELECT tojoin.col_to_join
FROM account
LEFT JOIN tojoin
ON account.a = tojoin.a
"""
driver.query(query)
I've found that it can be necessary to break up the transfer of data from the csv to the database in chunks so as not to run out of memory. This can be done like this:
import csv
import sqlite3
from operator import itemgetter

# Establish connection
conn = sqlite3.connect("mydb.db")

# Create the table
conn.execute(
    """
    CREATE TABLE persons(
        person_id INTEGER,
        last_name TEXT,
        first_name TEXT,
        address TEXT
    )
    """
)

# These are the columns from the csv that we want
cols = ["person_id", "last_name", "first_name", "address"]

# If the csv file is huge, we instead add the data in chunks
chunksize = 10000

# Parse csv file and populate db in chunks
with conn, open("persons.csv") as f:
    reader = csv.DictReader(f)
    chunk = []
    for i, row in enumerate(reader):
        if i % chunksize == 0 and i > 0:
            conn.executemany(
                """
                INSERT INTO persons
                VALUES(?, ?, ?, ?)
                """, chunk
            )
            chunk = []
        items = itemgetter(*cols)(row)
        chunk.append(items)
    # Flush whatever is left over after the last full chunk
    if chunk:
        conn.executemany(
            """
            INSERT INTO persons
            VALUES(?, ?, ?, ?)
            """, chunk
        )
Here is my version. It works by asking you to select the '.csv' file you want to convert:
from multiprocessing import current_process
import pandas as pd
import sqlite3
import os
from tkinter import Tk
from tkinter.filedialog import askopenfilename
from pathlib import Path


def csv_to_db(csv_filedir):
    if not Path(csv_filedir).is_file():  # if needed, ask for user input of CSV file
        current_path = os.getcwd()
        Tk().withdraw()
        csv_filedir = askopenfilename(initialdir=current_path)
    try:
        data = pd.read_csv(csv_filedir)  # load CSV file
    except:
        print("Something went wrong when opening the file")
        print(csv_filedir)
    csv_df = pd.DataFrame(data)
    csv_df = csv_df.fillna('NULL')  # make NaN equal to 'NULL' for SQL format
    [path, filename] = os.path.split(csv_filedir)  # define path and filename
    [filename, _] = os.path.splitext(filename)
    database_filedir = os.path.join(path, filename + '.db')
    conn = sqlite3.connect(database_filedir)  # connect to SQL server
    [fields_sql, header_sql_string] = create_sql_fields(csv_df)
    # CREATE EMPTY DATABASE
    create_sql = ''.join(['CREATE TABLE IF NOT EXISTS ' + filename + ' (' + fields_sql + ')'])
    cursor = conn.cursor()
    cursor.execute(create_sql)
    # INSERT EACH ROW IN THE SQL DATABASE
    for irow in csv_df.itertuples():
        insert_values_string = ''.join(['INSERT INTO ', filename, header_sql_string, ' VALUES ('])
        insert_sql = f"{insert_values_string} {irow[1]}, '{irow[2]}','{irow[3]}', {irow[4]}, '{irow[5]}' )"
        print(insert_sql)
        cursor.execute(insert_sql)
    # COMMIT CHANGES TO DATABASE AND CLOSE CONNECTION
    conn.commit()
    conn.close()
    print('\n' + csv_filedir + ' \n converted to \n' + database_filedir)
    return database_filedir


def create_sql_fields(df):  # gather the headers of the CSV and create two strings
    fields_sql = []    # str1 = var1 TYPE, var2 TYPE ...
    header_names = []  # str2 = var1, var2, var3, var4
    for col in range(0, len(df.columns)):
        fields_sql.append(df.columns[col])
        fields_sql.append(str(df.dtypes[col]))
        header_names.append(df.columns[col])
        if col != len(df.columns) - 1:
            fields_sql.append(',')
            header_names.append(',')
    fields_sql = ' '.join(fields_sql)
    fields_sql = fields_sql.replace('int64', 'integer')
    fields_sql = fields_sql.replace('float64', 'integer')
    fields_sql = fields_sql.replace('object', 'text')
    header_sql_string = '(' + ''.join(header_names) + ')'
    return fields_sql, header_sql_string


csv_to_db('')

How to save multiple csv files using to_csv in python?

Need to create multiple CSV files at the location
csv_data="D:\Desktop\DS\DSDA\Assignment\"
for a in range(60):
csv_data = csv_data + str(a)
csv_data = DataFrameDict[a].to_csv(csv_data.csv, index = False, header=True)`
csv_data = ""
I have tried:
file.with_suffix('.csv'), index = False) also didnt work
The first/better path for me would be to use pathlib; however, let's stick to your question. I would suggest adding the '.csv' to the string concatenation:
csv_data = "D:\\Desktop\\DS\\DSDA\\Assignment\\"  # double the backslashes; a single trailing backslash escapes the closing quote
for a in range(60):
    # add the '.csv' here
    csv_data = csv_data + str(a) + '.csv'
    DataFrameDict[a].to_csv(csv_data, index=False, header=True)
    # what is this line for?
    csv_data = ""
Let's see if it works.
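A minimal sketch of the pathlib route mentioned above (assuming DataFrameDict maps 0-59 to DataFrames); it also shows why the file.with_suffix('.csv') attempt failed on a plain string:

from pathlib import Path

out_dir = Path(r"D:\Desktop\DS\DSDA\Assignment")
for a in range(60):
    # with_suffix exists on Path objects, not on str
    out_file = (out_dir / str(a)).with_suffix(".csv")
    DataFrameDict[a].to_csv(out_file, index=False, header=True)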
This code is able to create 60 CSV files, one per DataFrame:
base_path = "D:\\Desktop\\DS\\DSDA\\Assignment\\"
for a in range(60):
    csv_data = base_path + str(a)
    DataFrameDict[a].to_csv(csv_data + ".csv", index=False, header=True)

How to stop writing a blank line at the end of csv file - pandas

When saving the data to csv with data.to_csv('csv_data', sep=',', encoding='utf-8', header=False, index=False), it creates a blank line at the end of the csv file.
How do you avoid that?
It's got to do with the line_terminator, whose default value is \n, for a new line.
Is there a way to specify the line_terminator to avoid creating a blank line at the end, or do I need to read the csv file, remove the blank line, and save it?
I'm not that familiar with pandas. Your help will be appreciated, thanks in advance!
One way would be to save all the data except the last entry with the default line_terminator (\n), and append the last line with line_terminator="":
data1 = data.iloc[0:len(data)-1]
data2 = data.iloc[[len(data)-1]]
data1.to_csv('csv_data', sep=',', encoding='utf-8', header=False, index=False)
data2.to_csv('csv_data', sep=',', encoding='utf-8', header=False, index=False, mode='a', line_terminator="")
For some reason, the line terminator did not work when I tried it. (It gave an error, saying line_terminator is an unrecognized keyword argument.)
However, this will do the trick:
df.to_csv(path)
with open(path) as f:
    lines = f.readlines()
last = len(lines) - 1
lines[last] = lines[last].replace('\r', '').replace('\n', '')
with open(path, 'w') as wr:
    wr.writelines(lines)
file_out = r'c:\your_output_file_path\file_name.csv'
df.to_csv(file_out)
file_data = open(file_out, 'rb').read()
open(file_out, 'wb').write(file_data[:-2])
df.to_csv() has a parameter called line_terminator with a default value of '\n'. This new line character is the issue at hand.
The code above:
1) writes the dataframe to file as normal
2) opens the file and reads the bytes data into the file_data variable
3) writes the file_data variable back out to the same file, trimming off the trailing line terminator with the slice file_data[:-2]
One solution is to not use pandas to export the data to a file. The example below will not include an empty row at the end of the file. However, it's probably a lot slower than pandas' "to_csv" method.
import pandas as pd


def export_dataframe_to_file(
        df: pd.DataFrame, file_name: str,
        header=True, index=True, delimiter=',',
        line_terminator='\n', encoding='utf-8'
) -> None:
    '''
    This function exports a Pandas DataFrame to a file without
    including an empty row at the very end of the file.
    '''
    number_of_rows, current_row = len(df), 1
    with open(file_name, 'w', encoding=encoding) as file:
        if header:
            file.write(
                delimiter*index + delimiter.join(df.columns)
                + line_terminator
            )
        for df_index, series in df.iterrows():
            file.write(
                (str(df_index) + delimiter)*index
                + delimiter.join(series.astype(str))
                + line_terminator*(not not number_of_rows - current_row)
            )
            current_row += 1
    return
None of the solutions above worked for me, because (as the original question says) the goal was to send the file to another script/REST API that wouldn't accept a carriage return. This likely involves the requests library being used to send the csv file to the REST API. I was able to use the requests library to send a file which had a carriage return via a REST API:
import requests
import pandas as pd
import settings

file_name = 'Hierarchy.csv'
df = pd.read_csv(file_name)
df.to_csv(file_name, sep=',', encoding='utf-8', index=False)
headers = {
    'x-api-key': settings.MONITOR_REST_API_KEY,
    'x-api-token': settings.MONITOR_REST_API_TOKEN,
    'accept': 'application/json'
}
files = {'file': (file_name, open(file_name, 'rb'), 'text/csv')}
monitor_rest_url = "https://api.yourcloud.com"
response = requests.post(monitor_rest_url + '/api/v2/your_endpoint',
                         files=files, verify=False, headers=headers)
print(response.text)
A more efficient way is to open the file first, write to that stream, then remove the last newline:
import os

with open('csv_data', 'wb') as dst:
    data.to_csv(dst, sep=',', encoding='utf-8', header=False, index=False)
    dst.seek(-1, os.SEEK_END)  # <---- 1 : len('\n')
    dst.truncate()
