I am using the sample code below to test a Naive Bayes (NB) classifier, and I'm getting the following error from line 22:
_csv.Error: new-line character seen in unquoted field - do you need to open the file in universal-newline mode?
This is a sample row of the csv file:
b8:27:eb:38:72:a7,df598b5eb8f4,5/9/16 14:47,154aec250ef6,-84,outside
Sample of the code:
from sklearn.preprocessing import LabelBinarizer
import numpy as np
from sklearn import naive_bayes
import csv
import random
from sklearn import metrics
import urllib

url = "example.com"
webpage = urllib.urlopen(url)
# download the file
#raw_data = urllib.urlopen(url)
datareader = csv.reader(webpage)  # line 22 is this one
ct = 0
for row in datareader:
    ct = ct + 1

webpage = urllib.urlopen(url)
datareader = csv.reader(webpage)
data = np.array(-1 * np.ones((ct, 6), float), object)
k = 0
for row in datareader:
    data[k, :] = np.array(row)
    k = k + 1

featnames = np.array(['unti', 'dongle', 'timestamp', 'tracker', 'rssi', 'label'], str)
keys = [[]] * np.size(data, 1)
numdata = -1 * np.ones_like(data)
for k in range(np.size(data, 1)):
    keys[k], garbage, numdata[:, k] = np.unique(data[:, k], True, True)

numrows = np.size(numdata, 0)
numcols = np.size(numdata, 1)
numdata = np.array(numdata, int)
xdata = numdata[:, :-1]
ydata = numdata[:, -1]

lbin = LabelBinarizer()
for k in range(np.size(xdata, 1)):
    if k == 0:
        xdata_ml = lbin.fit_transform(xdata[:, k])
    else:
        xdata_ml = np.hstack((xdata_ml, lbin.fit_transform(xdata[:, k])))
ydata_ml = lbin.fit_transform(ydata)

allIDX = np.arange(numrows)
random.shuffle(allIDX)
holdout_number = numrows / 10
testIDX = allIDX[0:holdout_number]
trainIDX = allIDX[holdout_number:]
xtest = xdata_ml[testIDX, :]
xtrain = xdata_ml[trainIDX, :]
ytest = ydata[testIDX]
ytrain = ydata[trainIDX]

mnb = naive_bayes.MultinomialNB()
mnb.fit(xtrain, ytrain)
print "Classification accuracy of MNB =", mnb.score(xtest, ytest)
Can anyone help me find the error and suggest a fix?
Are you using Windows? If so, this can be solved by:
datareader = csv.reader(webpage, dialect=csv.excel_tab)
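If changing the dialect doesn't help, another approach that often clears this error when reading a download is to split the response into lines yourself, so csv.reader never sees a stray carriage return. A sketch, assuming the Python 2 urllib call and the url variable from the question:
import csv
import urllib

webpage = urllib.urlopen(url)
# splitlines() understands \n, \r\n and bare \r line endings
datareader = csv.reader(webpage.read().splitlines())
for row in datareader:
    print row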
Some of the answers to CSV new-line character seen in unquoted field error refer to this happening with CSV files on a Mac.
Can you try manually downloading the file to your Mac and doing the following with it as a local file:
1) Save the file as CSV (MS-DOS Comma-Separated)
2) Save the file as CSV (Windows Comma-Separated)
3) Run the following script
with open(csv_filename, 'rU') as csvfile:
    csvreader = csv.reader(csvfile)
    for row in csvreader:
        print ', '.join(row)
An explanation of 'rU' is in PEP 278: https://www.python.org/dev/peps/pep-0278/
In a Python with universal newline support open() the mode parameter can also be "U", meaning "open for input as a text file with universal newline interpretation". Mode "rU" is also allowed, for symmetry with "rb"
Rationale
Universal newline support is implemented in C, not in Python.
This is done because we want files with a foreign newline
convention to be import-able, so a Python Lib directory can be
shared over a remote file system connection, or between MacPython
and Unix-Python on Mac OS X
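Note that 'U' is a Python 2 idiom: mode 'U' was deprecated in Python 3 and removed in Python 3.11. On Python 3, the csv docs recommend opening the file with newline='' instead and letting the reader recognize \r, \n and \r\n itself. A minimal sketch:
import csv

with open(csv_filename, newline='') as csvfile:
    for row in csv.reader(csvfile):
        print(', '.join(row))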
Related
The place where I put this code belongs to an import page. The data I want to import is in .txt format, but it contains the \n character.
if request.method == "POST":
    txt_file = request.FILES['file']
    if not txt_file.name.endswith('.txt'):
        messages.info(request, 'This is not a txt file')
    data_set = txt_file.read().decode('latin-1')
    io_string = io.StringIO(data_set)
    next(io_string)
    csv_reader = csv.reader(io_string, delimiter='\t', quotechar="|")
    for column in csv_reader:
        b = Module_Name(
            user=request.user,
            a=column[1],
            b=column[2],
            c=column[3],
            d=column[4],
            e=column[5],
            f=column[6],
            g=column[7],
            h=column[8],
        )
        b.save()
    messages.success(request, "Successfully Imported...")
    return redirect("return:return_import")
This can be called the full version of my code. To explain: the data that arrives here as column[1] contains a \n character. The file is a .txt file from another export, and in that export column[1] looks like:
This is
a value
and my Django localhost gives the warning "new-line character seen in unquoted field - do you need to open the file in universal-newline mode?" and aborts the import.
The csv reader iterates over rows, not columns, so if you want to join together the data from a given column you must iterate over all the rows first. For example:
import csv
from io import StringIO

io_string = "this is , r0 c1\r\na value, r1 c2\r\n"
io_string = StringIO(io_string)
rows = csv.reader(io_string)
column_0_data = []
for row in rows:
    column_0_data.append(row[0])
print("".join(column_0_data))
The rest of your code looks iffy to me, but that is off-topic.
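On the error itself: a field containing \n is perfectly legal CSV as long as the field is quoted, and the csv module will round-trip it if you open the file with newline='' on both ends. A minimal sketch:
import csv

# writing: the writer quotes the field automatically because it contains a newline
with open('demo.csv', 'w', newline='') as f:
    csv.writer(f).writerow(['This is\na value', 'second column'])

# reading: newline='' lets the reader see the quoted newline intact
with open('demo.csv', newline='') as f:
    for row in csv.reader(f):
        print(row)  # ['This is\na value', 'second column']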
I am trying to extract tweets with Python and store them in a CSV file, but I can't seem to include all languages. Arabic appears as special characters.
def recup_all_tweets(screen_name, api):
    all_tweets = []
    new_tweets = api.user_timeline(screen_name, count=300)
    all_tweets.extend(new_tweets)
    #outtweets = [[tweet.id_str, tweet.created_at, tweet.text, tweet.retweet_count, get_hashtagslist(tweet.text)] for tweet in all_tweets]
    outtweets = [[tweet.text, tweet.entities['hashtags']] for tweet in all_tweets]
    # with open('recup_all_tweets.json', 'w', encoding='utf-8') as f:
    #     f.write(json.dumps(outtweets, indent=4, sort_keys=True))
    with open('recup_all_tweets.csv', 'w', encoding='utf-8') as f:
        writer = csv.writer(f, delimiter=',')
        writer.writerow(["text", "tag"])
        writer.writerows(outtweets)
    return outtweets
Example of writing both CSV and JSON:
#coding:utf8
import csv
import json

s = ['عربى', 'عربى', 'عربى']

with open('output.csv', 'w', encoding='utf-8-sig', newline='') as f:
    r = csv.writer(f)
    r.writerow(['header1', 'header2', 'header3'])
    r.writerow(s)

with open('output.json', 'w', encoding='utf8') as f:
    json.dump(s, f, ensure_ascii=False)
output.csv:
header1,header2,header3
عربى,عربى,عربى
output.csv viewed in Excel: (screenshot omitted; the Arabic displays correctly)
output.json:
["عربى", "عربى", "عربى"]
Note that Microsoft Excel needs utf-8-sig to read a UTF-8 file properly. Other applications may or may not need it. Many Windows applications require a UTF-8 "BOM" signature at the start of a text file, or else they assume an ANSI encoding instead; the ANSI encoding varies depending on the localized version of Windows used.
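For reference, utf-8-sig is plain UTF-8 with the BOM prepended on encode, so the two spellings below produce the same leading bytes (a quick sketch):
print(u'\ufeff'.encode('utf8'))    # b'\xef\xbb\xbf' -- the UTF-8 BOM
print(u'abc'.encode('utf-8-sig'))  # b'\xef\xbb\xbfabc' -- BOM, then the text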
Maybe try with
f.write(json.dumps(outtweets, indent=4, sort_keys=True, ensure_ascii=False))
I searched a lot and finally wrote the following piece of code:
import arabic_reshaper
from bidi.algorithm import get_display
from selenium.webdriver.common.by import By
from time import sleep
import pandas as pd

# webdriver is assumed to be an already-initialised Selenium driver instance
itemsX = webdriver.find_elements(By.CLASS_NAME, "x1i10hfl")
item_linksX = [itemX.get_attribute("href") for itemX in itemsX]
item_linksX = filter(lambda k: '/p/' in k, item_linksX)
counter = 0
for item_linkX in item_linksX:
    AllComments2 = []
    counter = counter + 1
    webdriver.get(item_linkX)
    print(item_linkX)
    sleep(11)
    comments = webdriver.find_elements(By.CLASS_NAME, "_aacl")
    for comment in comments:
        try:
            reshaped_text = arabic_reshaper.reshape(comment.text)
            bidi_text = get_display(reshaped_text)
            AllComments2.append(reshaped_text)
        except:
            pass
    df = pd.DataFrame({'col': AllComments2})
    df.to_csv(r'C:\Crawler\Comments' + str(counter) + '.csv', sep='\t', encoding='utf-16')
This code worked perfectly for me. I hope it helps anyone for whom the code in the previous posts didn't work.
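For context, arabic_reshaper and python-bidi only change how the text is shaped and ordered for display; a tiny sketch of that pipeline in isolation:
import arabic_reshaper
from bidi.algorithm import get_display

text = 'عربى'
reshaped_text = arabic_reshaper.reshape(text)  # join letters into presentation forms
bidi_text = get_display(reshaped_text)         # reorder for a left-to-right display
print(bidi_text)
If the goal is only to store the comments in a file, writing the raw text with a suitable encoding (as in the answers above) may be enough; reshaping mainly matters for renderers without Arabic shaping support.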
I want this output written to a CSV file:
['https://www.lendingclub.com/loans/personal-loans' '6.16% to 35.89%']
['https://www.lendingclub.com/loans/personal-loans' '1% to 6%']
['https://www.marcus.com/us/en/personal-loans' '6.99% to 24.99%']
['https://www.marcus.com/us/en/personal-loans' '6.99% to 24.99%']
['https://www.marcus.com/us/en/personal-loans' '6.99% to 24.99%']
['https://www.marcus.com/us/en/personal-loans' '6.99% to 24.99%']
['https://www.marcus.com/us/en/personal-loans' '6.99% to 24.99%']
['https://www.discover.com/personal-loans/' '6.99% to 24.99%']
However, when I run the code to write the output to CSV, I only get the last line written to the file:
['https://www.discover.com/personal-loans/' '6.99% to 24.99%']
Could it be because my printed output is not comma-separated? I attempted to circumvent having to put a comma in there by using a space as the delimiter. Let me know your thoughts; I would love some help on this because I am having the hardest time reshaping this collected data.
plcompetitors = ['https://www.lendingclub.com/loans/personal-loans',
                 'https://www.marcus.com/us/en/personal-loans',
                 'https://www.discover.com/personal-loans/']

#cycle through links in array until it finds APR rates/fixed or variable using regex
for link in plcompetitors:
    cdate = datetime.date.today()
    l = r.get(link)
    l.encoding = 'utf-8'
    data = l.text
    soup = bs(data, 'html.parser')
    #captures Discover's rate perfectly but catches too much for lightstream/prosper
    paragraph = soup.find_all(text=re.compile('[0-9]%'))
    for n in paragraph:
        matches = re.findall(r'(?i)\d+(?:\.\d+)?%\s*(?:to|-)\s*\d+(?:\.\d+)?%', n.string)
        try:
            irate = str(matches[0])
            array = np.asarray(irate)
            array2 = np.append(link, irate)
            array2 = np.asarray(array2)
            print(array2)
            #with open('test.csv', "w") as csv_file:
            #    writer = csv.writer(csv_file, delimiter=' ')
            #    for line in test:
            #        writer.writerow(line)
        except IndexError:
            pass
When it comes to working with CSV files, pandas comes in handy.
import datetime
import requests as r
from bs4 import BeautifulSoup as bs
import numpy as np
import regex as re
import pandas as pd

plcompetitors = ['https://www.lendingclub.com/loans/personal-loans',
                 'https://www.marcus.com/us/en/personal-loans',
                 'https://www.discover.com/personal-loans/']
df = pd.DataFrame({'Link': [], 'APR Rate': []})

#cycle through links in array until it finds APR rates/fixed or variable using regex
for link in plcompetitors:
    cdate = datetime.date.today()
    l = r.get(link)
    l.encoding = 'utf-8'
    data = l.text
    soup = bs(data, 'html.parser')
    #captures Discover's rate perfectly but catches too much for lightstream/prosper
    paragraph = soup.find_all(text=re.compile('[0-9]%'))
    for n in paragraph:
        matches = re.findall(r'(?i)\d+(?:\.\d+)?%\s*(?:to|-)\s*\d+(?:\.\d+)?%', n.string)
        irate = ''
        try:
            irate = str(matches[0])
            df2 = pd.DataFrame({'Link': [link], 'APR Rate': [irate]})
            df = pd.concat([df, df2], join="inner")
        except IndexError:
            pass

df.to_csv('CSV_File.csv', index=False)
I store each link and its irate value in a data frame df2 and concatenate it to the parent data frame df.
At the end, I write the parent data frame df to a CSV file.
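As a side note, pd.concat inside a loop copies the frame on every iteration; collecting plain rows and building one DataFrame at the end is a common alternative (a sketch reusing the same column names):
rows = []
for link in plcompetitors:
    # ... scrape irate for this link as above ...
    rows.append({'Link': link, 'APR Rate': irate})

df = pd.DataFrame(rows, columns=['Link', 'APR Rate'])
df.to_csv('CSV_File.csv', index=False)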
I think the problem is that you are opening the file in write mode (the "w" in open('test.csv', "w")), meaning that Python overwrites what's already written in the file. I think you're looking for append mode:
# open the file before the loop, and close it after
csv_file = open("test.csv", 'a')  # change the 'w' to an 'a'
csv_file.truncate(0)  # clear the contents of the file
writer = csv.writer(csv_file, delimiter=' ')  # make the writer beforehand for efficiency
for n in paragraph:
    matches = re.findall(r'(?i)\d+(?:\.\d+)?%\s*(?:to|-)\s*\d+(?:\.\d+)?%', n.string)
    try:
        irate = str(matches[0])
        array = np.asarray(irate)
        array2 = np.append(link, irate)
        array2 = np.asarray(array2)
        print(array2)
        writer.writerow(array2)  # write the [link, rate] pair for this match
    except IndexError:
        pass
# close the file
csv_file.close()
If this doesn't work, please let me know!
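An alternative that needs neither append mode nor the truncate call is to open the file once in 'w' mode before the loop, so every row goes into the same handle. A sketch; paragraph, link and re come from the question's scraping code:
import csv

with open('test.csv', 'w', newline='') as csv_file:
    writer = csv.writer(csv_file, delimiter=' ')
    for n in paragraph:
        matches = re.findall(r'(?i)\d+(?:\.\d+)?%\s*(?:to|-)\s*\d+(?:\.\d+)?%', n.string)
        if matches:
            # one row per match: the page URL and the rate range found on it
            writer.writerow([link, matches[0]])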
I have a problem when trying to import a .csv file. I convert an image to base64 and also create a barcode from the CSV file name. The image converts to base64 successfully, but when I try to create the barcode from the CSV file name I always get an error like:
Unknown error during import: <class 'openerp.exceptions.ValidationError'>: ('ValidateError', u'Field(s) `ean13` failed against a constraint: You provided an invalid "EAN13 Barcode" reference. You may use the "Internal Reference" field instead.') at row 2 Resolve other errors first
And this is my code:
files = []
text = ""
data_text3 = []
header_column2 = ["id", "product_variant_ids/ean13_barcode", "product_variant_ids/ean13", "ean13", "image", "ean13_barcode", "default_code", "product_variant_ids/default_code"]
number = 1
for file in os.listdir("gmbr/"):
    file_name = os.path.splitext(file)[0]
    for n in str(number):
        directory_file = "gmbr/" + str(file)
        img = open(directory_file, 'rb').read()
        img_64 = base64.encodestring(img)
        # name_product is assumed to be defined elsewhere in the module
        text = str(number) + "," + str(name_product) + "," + str(file_name) + "," + str(file_name) + "," + str(img_64 + "," + " " + "," + " " + "," + " ")
        number += 1
        data_text3.append(text)

with open('sample2.csv', 'wb') as f:
    writer = csv.writer(f, delimiter='\t', dialect='excel')
    writer.writerow(header_column2)
    for l in data_text3:
        writer.writerow(l.split(','))
I am trying to perform text analysis on Chinese texts. The program is provided below. I got a result with unreadable characters such as 浜烘皯鏃ユ姤绀捐, and if I change the output file result.csv to result.txt, the characters come out correctly, as 人民日报社论. What's wrong here? I cannot figure it out. I tried several ways, including adding a decoder and encoder.
# -*- coding: utf-8 -*-
import os
import glob
import jieba
import jieba.analyse
import csv
import codecs

segList = []
raw_data_path = 'monthly_raw_data/'
file_name = ["201010", "201011", "201012", "201101", "201103", "201105", "201107", "201109", "201110", "201111", "201112", "201201", "201202", "201203", "201205", "201206", "201208", "201210", "201211"]
jieba.load_userdict("customized_dict.txt")
for name in file_name:
    all_text = ""
    multi_line_text = ""
    with open(raw_data_path + name + ".txt", "r") as file:
        for line in file:
            if line != '\n':
                multi_line_text += line
    templist = multi_line_text.split('\n')
    for text in templist:
        all_text += text
    seg_list = jieba.cut(all_text, cut_all=False)
    temp_text = []
    for item in seg_list:
        temp_text.append(item.encode('utf-8'))
    stop_list = []
    with open("stopwords.txt", "r") as stoplistfile:
        for item in stoplistfile:
            stop_list.append(item.rstrip('\r\n'))
    text_without_stopwords = []
    for word in temp_text:
        if word not in stop_list:
            text_without_stopwords.append(word)
    segList.append(text_without_stopwords)

with open("results/result.csv", 'wb') as f:
    writer = csv.writer(f)
    writer.writerows(segList)
For UTF-8 encoding, Excel requires a BOM (byte order mark) codepoint written at the start of the file or it will assume an ANSI encoding, which is locale-dependent. U+FEFF is the Unicode BOM. Here's an example that will open in Excel correctly:
#!python2
#coding:utf8
import csv

data = [[u'American', u'美国人'],
        [u'Chinese', u'中国人']]

with open('results.csv', 'wb') as f:
    f.write(u'\ufeff'.encode('utf8'))
    w = csv.writer(f)
    for row in data:
        w.writerow([item.encode('utf8') for item in row])
Python 3 makes this easier. Use the parameters 'w', newline='', encoding='utf-8-sig' instead of 'wb'; the file will then accept Unicode strings directly and automatically write a BOM:
#!python3
#coding:utf8
import csv

data = [['American', '美国人'],
        ['Chinese', '中国人']]

with open('results.csv', 'w', newline='', encoding='utf-8-sig') as f:
    w = csv.writer(f)
    w.writerows(data)
There is also a third-party unicodecsv module that makes Python 2 easier to work with as well:
#!python2
#coding:utf8
import unicodecsv

data = [[u'American', u'美国人'],
        [u'Chinese', u'中国人']]

with open('results.csv', 'wb') as f:
    w = unicodecsv.writer(f, encoding='utf-8-sig')
    w.writerows(data)
Here is another way that is kind of tricky:
#!python2
#coding:utf8
import csv

data = [[u'American', u'美国人'],
        [u'Chinese', u'中国人']]

with open('results.csv', 'wb') as f:
    f.write(u'\ufeff'.encode('utf8'))
    w = csv.writer(f)
    for row in data:
        w.writerow([item.encode('utf8') for item in row])
This code block generates a CSV file encoded in UTF-8. Then:
1) Open the file with Notepad++ (or another editor with an encoding feature)
2) Encoding -> Convert to ANSI
3) Save
Open the file with Excel; it's OK.
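The Notepad++ conversion can also be scripted. A sketch, assuming a Chinese-locale Windows where "ANSI" means GBK (cp936); substitute the codepage that matches your locale:
# read the UTF-8 CSV and rewrite it in the local "ANSI" codepage
with open('results.csv', encoding='utf8') as src:
    text = src.read()
with open('results_ansi.csv', 'w', encoding='gbk', newline='') as dst:
    dst.write(text)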