remove \xa0 while writing csv file - python

I have the code below to write a list of items to a CSV file, but a special character (\xa0, a non-breaking space) keeps appearing in the output. For local testing I defined a list with '\xa0' included in the list items and tried various ways to remove it or replace it with a space, but I still get the special character in the CSV output. Can anyone help?
import csv
from flask import make_response
import StringIO

csv_list = [['hfhf\xa0 fsdg', 'dsf'], ['fsdgs fsdfs', 'fsdfsd'], ['dsf\xa0 sf', 'asfg']]

def download_csv_summary(csv_list):
    si = StringIO.StringIO()
    cw = csv.writer(si)
    filename = 'Test'
    cw.writerows(csv_list)
    output = make_response(si.getvalue())
    output.headers['Content-Disposition'] = \
        'attachment; filename={filename}.csv'.format(filename=filename)
    output.headers['Content-Type'] = 'text/csv'
    return output

The writerows line needs to be changed to replace the \xa0s:
import csv
from flask import make_response
import StringIO

csv_list = [
    ['hfhf\xa0 fsdg', 'dsf'],
    ['fsdgs fsdfs', 'fsdfsd'],
    ['dsf\xa0 sf', 'asfg']
]

def download_csv_summary(csv_list):
    si = StringIO.StringIO()
    cw = csv.writer(si)
    filename = 'Test'
    cw.writerows([[str(x).replace('\xa0', '') for x in l] for l in csv_list])
    output = make_response(si.getvalue())
    output.headers['Content-Disposition'] = \
        'attachment; filename={filename}.csv'.format(filename=filename)
    output.headers['Content-Type'] = 'text/csv'
    return output
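For reference, the same approach works on Python 3 with io.StringIO in place of the old StringIO module; a minimal sketch (not part of the original answer):

import csv
import io

# Python 3 sketch: io.StringIO replaces the StringIO module, and '\xa0'
# is the same non-breaking space character in str literals.
csv_list = [['hfhf\xa0 fsdg', 'dsf'], ['dsf\xa0 sf', 'asfg']]
si = io.StringIO()
cw = csv.writer(si)
cw.writerows([[str(x).replace('\xa0', '') for x in row] for row in csv_list])
print(si.getvalue())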

Related

replace('\n','') does not work for import to .txt format

This code lives on an import page. The data I want to import is in .txt format, but it contains the \n character.
if request.method == "POST":
    txt_file = request.FILES['file']
    if not txt_file.name.endswith('.txt'):
        messages.info(request, 'This is not a txt file')
    data_set = txt_file.read().decode('latin-1')
    io_string = io.StringIO(data_set)
    next(io_string)
    csv_reader = csv.reader(io_string, delimiter='\t', quotechar="|")
    for column in csv_reader:
        b = Module_Name(
            user=request.user,
            a=column[1],
            b=column[2],
            c=column[3],
            d=column[4],
            e=column[5],
            f=column[6],
            g=column[7],
            h=column[8],
        )
        b.save()
    messages.success(request, "Successfully Imported...")
    return redirect("return:return_import")
This can be called the full version of my code. To explain: the data that arrives here as column[1] contains a \n character. The file is a .txt file produced by another export, and in that export column[1] looks like:
This is
a value
and my Django localhost gives the warning "new-line character seen in unquoted field - do you need to open the file in universal-newline mode?" and aborts the import.
The csv reader iterates over rows, not columns. So if you want to append the data from a given column together, you must iterate over all the rows first. For example:
import csv
from io import StringIO

io_string = "this is , r0 c1\r\na value, r1 c2\r\n"
io_string = StringIO(io_string)

rows = csv.reader(io_string)
column_0_data = []
for row in rows:
    column_0_data.append(row[0])

print("".join(column_0_data))
The rest of your code looks iffy to me, but that is off topic.
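One hedged way to repair records whose fields contain raw newlines, since csv.reader sees the continuation as a separate short row: hold the partial row and glue the next physical line onto it. This is a sketch, and EXPECTED_COLS is an assumption about the export format, not something from the question:

import csv
import io

EXPECTED_COLS = 3  # assumption: how many tab-separated fields a full record has
raw = "x\tthis is\na value\tok\ny\tz\tw\n"  # second field of record 1 contains '\n'

rows, pending = [], []
for row in csv.reader(io.StringIO(raw), delimiter='\t'):
    if pending:
        # glue this physical line onto the held partial record
        row = pending[:-1] + [pending[-1] + ' ' + row[0]] + row[1:]
        pending = []
    if len(row) < EXPECTED_COLS:
        pending = row  # the record was split by an embedded newline
    else:
        rows.append(row)

print(rows)  # [['x', 'this is a value', 'ok'], ['y', 'z', 'w']]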

Defining a list within the code versus reading it from a file

I am trying to count the number of specific words in a given report. Does anyone know why defining the word list within the code makes the second part of the following code run faster than reading the list from a file? Is there a solution? The actual list contains the same kind of words but is a lot longer than the two words in the following example.
# Example code: Within code list
import csv
import glob
import re
import time

TARGET_FILES = r'C:/Users/s170760/Desktop/Reports_Cleaned/*.*'
OUTPUT_FILE = r'C:/Users/s170760/Desktop/Parser.csv'
OUTPUT_FIELDS = ['file name', 'create']

create = {'agile', 'skills'}

def main():
    f_out = open(OUTPUT_FILE, 'w')
    wr = csv.writer(f_out, lineterminator='\n')
    wr.writerow(OUTPUT_FIELDS)
    file_list = glob.glob(TARGET_FILES)
    for file in file_list:
        print(file)
        with open(file, 'r', encoding='UTF-8', errors='ignore') as f_in:
            doc = f_in.read()
        doc = doc.lower()
        output_data = get_data(doc)
        output_data[0] = file
        wr.writerow(output_data)

def get_data(doc):
    _odata = [0] * 2
    tokens = re.findall(r'\w(?:[-\w]*\w)?', doc)
    for token in tokens:
        if token in create:
            _odata[1] += 1
    return _odata
Here is the other way:
# Example code: Reading list from a file
import csv
import glob
import re
import time

TARGET_FILES = r'C:/Users/s170760/Desktop/Reports_Cleaned/*.*'
OUTPUT_FILE = r'C:/Users/s170760/Desktop/Parser.csv'
OUTPUT_FIELDS = ['file name', 'create']

create = open('C:/Users/s170760/Desktop/Create.txt', 'r').read().splitlines()

def main():
    f_out = open(OUTPUT_FILE, 'w')
    wr = csv.writer(f_out, lineterminator='\n')
    wr.writerow(OUTPUT_FIELDS)
    file_list = glob.glob(TARGET_FILES)
    for file in file_list:
        print(file)
        with open(file, 'r', encoding='UTF-8', errors='ignore') as f_in:
            doc = f_in.read()
        doc = doc.lower()
        output_data = get_data(doc)
        output_data[0] = file
        wr.writerow(output_data)

def get_data(doc):
    _odata = [0] * 2
    tokens = re.findall(r'\w(?:[-\w]*\w)?', doc)
    for token in tokens:
        if token in create:
            _odata[1] += 1
    return _odata
As pointed out by Mark in the comments, the first code snippet uses a set of strings, while the second snippet loads the file into a list of strings.
Why sets are faster than lists for this use case is well explained in this Stack Overflow answer. Converting the output of open to a set will indeed solve your problem.
So replace:
create = open('C:/Users/s170760/Desktop/Create.txt', 'r').read().splitlines()
With:
create = set(open('C:/Users/s170760/Desktop/Create.txt', 'r').read().splitlines())
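To see the difference directly, here is a small timing sketch (numbers will vary by machine): get_data performs one membership test per token, and in on a list scans element by element while in on a set is a constant-time hash lookup on average.

import timeit

words_list = ['word%d' % i for i in range(10000)]
words_set = set(words_list)

# worst case for the list: the probe word is at the end
print(timeit.timeit("'word9999' in words_list", globals=globals(), number=1000))
print(timeit.timeit("'word9999' in words_set", globals=globals(), number=1000))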

How to write Arabic to a CSV file

I am trying to extract tweets with Python and store them in a CSV file, but I can't seem to include all languages. Arabic appears as special characters.
def recup_all_tweets(screen_name, api):
    all_tweets = []
    new_tweets = api.user_timeline(screen_name, count=300)
    all_tweets.extend(new_tweets)
    #outtweets = [[tweet.id_str, tweet.created_at, tweet.text, tweet.retweet_count, get_hashtagslist(tweet.text)] for tweet in all_tweets]
    outtweets = [[tweet.text, tweet.entities['hashtags']] for tweet in all_tweets]
    # with open('recup_all_tweets.json', 'w', encoding='utf-8') as f:
    #     f.write(json.dumps(outtweets, indent=4, sort_keys=True))
    with open('recup_all_tweets.csv', 'w', encoding='utf-8') as f:
        writer = csv.writer(f, delimiter=',')
        writer.writerow(["text", "tag"])
        writer.writerows(outtweets)
    # pass
    return outtweets
Example of writing both CSV and JSON:
#coding:utf8
import csv
import json

s = ['عربى', 'عربى', 'عربى']

with open('output.csv', 'w', encoding='utf-8-sig', newline='') as f:
    r = csv.writer(f)
    r.writerow(['header1', 'header2', 'header3'])
    r.writerow(s)

with open('output.json', 'w', encoding='utf8') as f:
    json.dump(s, f, ensure_ascii=False)
output.csv:
header1,header2,header3
عربى,عربى,عربى
output.json:
["عربى", "عربى", "عربى"]
Note: Microsoft Excel needs utf-8-sig to read a UTF-8 file properly. Other applications may or may not need it. Many Windows applications require a UTF-8 "BOM" signature at the start of a text file and otherwise assume an ANSI encoding, which varies depending on the localized version of Windows used.
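A quick way to see what utf-8-sig actually does (a sketch): it prepends the three-byte BOM that Excel looks for.

text = 'عربى'
print(text.encode('utf-8-sig')[:3])  # b'\xef\xbb\xbf' -- the UTF-8 BOM
print(text.encode('utf-8')[:3])      # no BOM; the bytes start straight into the text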
Maybe try with
f.write(json.dumps(outtweets, indent=4, sort_keys=True, ensure_ascii=False))
I searched a lot and finally wrote the following piece of code:
import arabic_reshaper
from bidi.algorithm import get_display
import numpy as np
import pandas as pd                          # assumed import: pd is used below
from time import sleep                       # assumed import: sleep is used below
from selenium.webdriver.common.by import By  # assumed import: By is used below

# note: webdriver here must be an already-initialised Selenium driver instance
itemsX = webdriver.find_elements(By.CLASS_NAME, "x1i10hfl")
item_linksX = [itemX.get_attribute("href") for itemX in itemsX]
item_linksX = filter(lambda k: '/p/' in k, item_linksX)
counter = 0
for item_linkX in item_linksX:
    AllComments2 = []
    counter = counter + 1
    webdriver.get(item_linkX)
    print(item_linkX)
    sleep(11)
    comments = webdriver.find_elements(By.CLASS_NAME, "_aacl")
    for comment in comments:
        try:
            reshaped_text = arabic_reshaper.reshape(comment.text)
            bidi_text = get_display(reshaped_text)
            AllComments2.append(reshaped_text)
        except:
            pass
    df = pd.DataFrame({'col': AllComments2})
    df.to_csv(r'C:\Crawler\Comments' + str(counter) + '.csv', sep='\t', encoding='utf-16')
This code worked perfectly for me. I hope it helps those for whom the code from the previous posts didn't work.
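For anyone unsure what the reshaping step does, a minimal sketch with a single string (arabic_reshaper joins the letters into their presentation forms, and get_display reorders them for display contexts without bidi support):

import arabic_reshaper
from bidi.algorithm import get_display

reshaped = arabic_reshaper.reshape('عربى')  # join letters into presentation forms
print(get_display(reshaped))                # reorder for left-to-right display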

Need to combine and convert data in paragraph format to csv

I am new to Python and am starting some online courses. I am trying to convert some data from a paragraph format to CSV format (shown below). I am able to import a text file containing the paragraph format and export it to CSV, but each line of the paragraph format comes out as a single row when imported into a spreadsheet.
import csv
import glob
import os

directory = raw_input("INPUT Folder:")
output = raw_input("OUTPUT Folder:")
txt_files = os.path.join(directory, '*.txt')

for txt_file in glob.glob(txt_files):
    with open(txt_file, "rb") as input_file:
        in_txt = csv.reader(input_file, delimiter='=')
        filename = os.path.splitext(os.path.basename(txt_file))[0] + '.csv'
        with open(os.path.join(output, filename), 'wb') as output_file:
            out_csv = csv.writer(output_file)
            out_csv.writerows(in_txt)
I do not know how to parse the data to separate the labels and spaces from the numeric values and combine each paragraph section into a single line with quotes and commas for the CSV file. Any help would be greatly appreciated!
Paragraph format:
12-03-06 15:19:36
FLOW: 1.17365 g/m
POS: +9273x1Gal
12-03-06 15:19:37
FLOW: 1.17849 g/m
POS: +9283x1Gal
12-03-06 15:19:38
FLOW: 1.19849 g/m
POS: +9293x1Gal
(repeats)
Desired CSV output (note: I had to add a single quote before the + to allow proper import as text into a spreadsheet; otherwise it comes in as a 0):
"12-03-06 15:19:36","FLOW:","1.17365","g/m","POS:","'+","9273","x1","Gal"
"12-03-06 15:19:37","FLOW:","1.17849","g/m","POS:","'+","9283","x1","Gal"
"12-03-06 15:19:38","FLOW:","1.19849","g/m","POS:","'+","9293","x1","Gal"
I suggest using a collections.deque to work on three lines at a time, and re.match to parse out the items you want:
# -*- coding: utf-8 -*-
from collections import deque
import csv
from functools import partial
import glob
import os
import re
import sys

if sys.hexversion < 0x3000000:
    # Python 2.x
    inp = raw_input
    open_csv_write = partial(open, mode="wb")
else:
    # Python 3.x
    inp = input
    open_csv_write = partial(open, mode="w", newline="")

POS_REG = re.compile(r"(POS:) ([+-])(\d+(?:\.\d+)?)(x\d+)(\w+)", re.I)

def change_ext(fn, new_ext):
    """
    Given `fn` as "path/filename.old_ext",
    return "path/filename" + new_ext
    """
    return os.path.splitext(fn)[0] + new_ext

def get_pos(line, reg=POS_REG):
    """
    Given a string like "POS: +92.73x1Gal",
    return ['POS:', '+', '92.73', 'x1', 'Gal']
    """
    match = reg.match(line)
    return list(match.groups()) if match else []

def process(inf, outcsv):
    # line queue
    q = deque(maxlen=3)
    # preload two lines
    q.append(next(inf, '').rstrip())
    q.append(next(inf, '').rstrip())
    # process rest of lines
    for line in inf:
        q.append(line.rstrip())
        if q[1].startswith('FLOW:'):
            pos = get_pos(line)
            if pos:
                row = [q[0]] + q[1].split() + pos
                outcsv.writerow(row)

def main():
    # get directories
    in_dir = inp("Input directory: ")
    out_dir = inp("Output directory: ")
    # process file names
    in_filespec = os.path.join(in_dir, '*.txt')
    in_full_names = glob.glob(in_filespec)
    in_names = [os.path.basename(fn) for fn in in_full_names]
    out_names = [change_ext(fn, ".csv") for fn in in_names]
    out_full_names = [os.path.join(out_dir, fn) for fn in out_names]
    # operate on files
    for in_name, out_name in zip(in_full_names, out_full_names):
        with open(in_name) as inf, open_csv_write(out_name) as outf:
            outcsv = csv.writer(outf)
            process(inf, outcsv)

if __name__ == "__main__":
    main()
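A quick check of process on one sample record, assuming the definitions above are in scope (a sketch; note it emits a plain + rather than the '+ the asker added by hand, and quoting=csv.QUOTE_ALL is how you would get every field quoted as in the desired output):

import csv
import io

sample = ("12-03-06 15:19:36\n"
          "FLOW: 1.17365 g/m\n"
          "POS: +9273x1Gal\n")

out = io.StringIO()
process(io.StringIO(sample), csv.writer(out, quoting=csv.QUOTE_ALL))
print(out.getvalue())
# "12-03-06 15:19:36","FLOW:","1.17365","g/m","POS:","+","9273","x1","Gal"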

Joining Multiple Lines in Python (Text Formatting)

I am working on pulling logs through a web API, and so far the logs come back in the following format: 3 events below, each starting with <attack_headlines version="1.0.1"> and ending with </attack_headlines>. My question is: what would be the best way to loop through each line and concatenate them so that each event looks like the expected output below?
Current output
<attack_headlines version="1.0.1">
<attack_headline>
<site_id>1</site_id>
<category>V2luZG93cyBEaXJlY3RvcmllcyBhbmQgRmlsZXM=</category>
<subcategory>SUlTIEhlbHA=</subcategory>
<client_ip>172.17.1.126</client_ip>
<date>1363735940</date>
<gmt_diff>0</gmt_diff>
<reference_id>6D13-DE3D-9539-8980</reference_id>
</attack_headline>
</attack_headlines>
<attack_headlines version="1.0.1">
<attack_headline>
<site_id>1</site_id>
<category>V2luZG93cyBEaXJlY3RvcmllcyBhbmQgRmlsZXM=</category>
<subcategory>SUlTIEhlbHA=</subcategory>
<client_ip>172.17.1.136</client_ip>
<date>1363735971</date>
<gmt_diff>0</gmt_diff>
<reference_id>6D13-DE3D-9539-8981</reference_id>
</attack_headline>
</attack_headlines>
<attack_headlines version="1.0.1">
<attack_headline>
<site_id>1</site_id>
<category>V2luZG93cyBEaXJlY3RvcmllcyBhbmQgRmlsZXM=</category>
<subcategory>SUlTIEhlbHA=</subcategory>
<client_ip>172.17.1.156</client_ip>
<date>1363735975</date>
<gmt_diff>0</gmt_diff>
<reference_id>6D13-DE3D-9539-8982</reference_id>
</attack_headline>
</attack_headlines>
Expected output
<attack_headlines version="1.0.1"><attack_headline><site_id>1</site_id><category>V2luZG93cyBEaXJlY3RvcmllcyBhbmQgRmlsZXM=</category><subcategory>SUlTIEhlbHA=</subcategory><client_ip>172.17.1.156</client_ip><date>1363735975</date><gmt_diff>0</gmt_diff><reference_id>6D13-DE3D-9539-8982</reference_id></attack_headline></attack_headlines>
Thanks in advance!
import json
import os
from suds.transport.https import WindowsHttpAuthenticated

class Helpers:
    def set_connection(self, conf):
        # SUDS BUG FIXER (doctor)
        protocol = conf['protocol']
        hostname = conf['hostname']
        port = conf['port']
        path = conf['path']
        file = conf['file']
        u_name = conf['login']
        passwrd = conf['password']
        auth_type = conf['authType']
        from suds.xsd.doctor import ImportDoctor, Import
        from suds.client import Client
        url = '{0}://{1}:{2}/{3}/{4}?wsdl'.format(protocol,
                                                  hostname, port, path, file)
        imp = Import('http://schemas.xmlsoap.org/soap/encoding/')
        d = ImportDoctor(imp)
        if auth_type == 'ntlm':
            ntlm = WindowsHttpAuthenticated(username=u_name, password=passwrd)
            client = Client(url, transport=ntlm, doctor=d)
        else:
            client = Client(url, username=u_name, password=passwrd, doctor=d)
        return client

    def read_from_file(self, filename):
        try:
            fo = open(filename, "r")
            try:
                result = fo.read()
            finally:
                fo.close()
            return result
        except IOError:
            print "##Error opening/reading file {0}".format(filename)
            exit(-1)

    def read_json(self, filename):
        string = self.read_from_file(filename)
        return json.loads(string)

    def get_recent_attacks(self, client):
        import time
        import base64
        from xml.dom.minidom import parseString
        epoch_time_now = int(time.time())
        epochtimeread = open('epoch_last', 'r')
        epoch_time_last_read = epochtimeread.read()
        epochtimeread.close()
        epoch_time_last = int(float(epoch_time_last_read))
        print client.service.get_recent_attacks("", epoch_time_last,
                                                epoch_time_now, 1, "", 15)
If this is just a single, large string object with line breaks, you can simply delete them:

import re
text = re.sub(r'\s*\n\s*', '', text)

To keep the line breaks that follow the </attack_headlines> delimiter, so each event stays on its own line, try:

text = re.sub(r'(?<!</attack_headlines>)\s*\n\s*', '', text)
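For example (a sketch on a tiny snippet), the lookbehind keeps only the newline that follows the closing delimiter:

import re

text = ('<attack_headline>\n'
        '<site_id>1</site_id>\n'
        '</attack_headline>\n'
        '</attack_headlines>\n'
        '<attack_headlines version="1.0.1">\n')
print(re.sub(r'(?<!</attack_headlines>)\s*\n\s*', '', text))
# <attack_headline><site_id>1</site_id></attack_headline></attack_headlines>
# <attack_headlines version="1.0.1">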
You could use:
oneline = "".join(multiline.split())
Edit 1 (I've just seen your edit) - I would change your code like this:
with open(filename, "r") as fo:
    result = []
    for line in fo.readlines():
        result.append(line.strip())
    return result
Edit 2 (I've read your comment on the other answer) - You could do it like this:
# note: this uses yield, so it belongs inside a generator function
with open(filename, "r") as fo:
    partial = []
    for line in fo.readlines():
        if line.startswith("<"):
            yield "".join(partial)
            partial = []
        else:
            clean = line.strip()
            if clean:
                partial.append(clean)
import re

# remove all newline whitespace stuff as in the answer given before:
text = re.sub(r'\s*\n\s*', '', text)
# break again at desired points:
text = re.sub(r'</attack_headlines>', '</attack_headlines>\n', text)
