I am trying to extract tweets with Python and store them in a CSV file, but I can't seem to include all languages. Arabic appears as special characters.
def recup_all_tweets(screen_name,api):
all_tweets = []
new_tweets = api.user_timeline(screen_name,count=300)
all_tweets.extend(new_tweets)
#outtweets = [[tweet.id_str, tweet.created_at, tweet.text,tweet.retweet_count,get_hashtagslist(tweet.text)] for tweet in all_tweets]
outtweets = [[tweet.text,tweet.entities['hashtags']] for tweet in all_tweets]
# with open('recup_all_tweets.json', 'w', encoding='utf-8') as f:
# f.write(json.dumps(outtweets, indent=4, sort_keys=True))
with open('recup_all_tweets.csv', 'w',encoding='utf-8') as f:
writer = csv.writer(f,delimiter=',')
writer.writerow(["text","tag"])
writer.writerows(outtweets)
# pass
return(outtweets)
Example of writing both CSV and JSON:
#coding:utf8
import csv
import json
s = ['عربى','عربى','عربى']
with open('output.csv','w',encoding='utf-8-sig',newline='') as f:
r = csv.writer(f)
r.writerow(['header1','header2','header3'])
r.writerow(s)
with open('output.json','w',encoding='utf8') as f:
json.dump(s,f,ensure_ascii=False)
output.csv:
header1,header2,header3
عربى,عربى,عربى
output.csv viewed in Excel:
output.json:
["عربى", "عربى", "عربى"]
Note Microsoft Excel needs utf-8-sig to read a UTF-8 file properly. Other applications may or may not need it to view properly. Many Windows applications required a UTF-8 "BOM" signature at the start of a text file or will assume an ANSI encoding instead. The ANSI encoding varies depending on the localized version of Windows used.
Maybe try with
f.write(json.dumps(outtweets, indent=4, sort_keys=True, ensure_ascii=False))
I searched a lot and finally wrote the following piece of code:
import arabic_reshaper
from bidi.algorithm import get_display
import numpy as np
itemsX = webdriver.find_elements(By.CLASS_NAME,"x1i10hfl")
item_linksX = [itemX.get_attribute("href") for itemX in itemsX]
item_linksX = filter(lambda k: '/p/' in k, item_linksX)
counter = 0
for item_linkX in item_linksX:
AllComments2 = []
counter = counter + 1
webdriver.get(item_linkX)
print(item_linkX)
sleep(11)
comments = webdriver.find_elements(By.CLASS_NAME,"_aacl")
for comment in comments:
try:
reshaped_text = arabic_reshaper.reshape(comment.text)
bidi_text = get_display(reshaped_text)
AllComments2.append(reshaped_text)
except:
pass
df = pd.DataFrame({'col':AllComments2})
df.to_csv('C:\Crawler\Comments' + str(counter) + '.csv', sep='\t', encoding='utf-16')
This code worked perfectly for me. I hope it helps those who haven't used the code from the previous post
Related
My program takes a csv file as input and writes it as an output file in json format. On the final line, I use the print command to output the contents of the json format file to the screen. However, it does not print out the json file contents and I don't understand why.
Here is my code that I have so far:
import csv
import json
def jsonformat(infile,outfile):
contents = {}
csvfile = open(infile, 'r')
reader = csvfile.read()
for m in reader:
key = m['No']
contents[key] = m
jsonfile = open(outfile, 'w')
jsonfile.write(json.dumps(contents))
csvfile.close()
jsonfile.close()
return jsonfile
infile = 'orders.csv'
outfile = 'orders.json'
output = jsonformat(infile,outfile)
print(output)
Your function returns the jsonfile variable, which is a file.
Try adding this:
jsonfile.close()
with open(outfile, 'r') as file:
return file.read()
Your function returns a file handle to the file jsonfile that you then print. Instead, return the contents that you wrote to that file. Since you opened the file in w mode, any previous contents are removed before writing the new contents, so the contents of your file are going to be whatever you just wrote to it.
In your function, do:
def jsonformat(infile,outfile):
...
# Instead of this:
# jsonfile.write(json.dumps(contents))
# do this:
json_contents = json.dumps(contents, indent=4) # indent=4 to pretty-print
jsonfile.write(json_contents)
...
return json_contents
Aside from that, you aren't reading the CSV file the correct way. If your file has a header, you can use csv.DictReader to read each row as a dictionary. Then, you'll be able to use for m in reader: key = m['No']. Change reader = csvfile.read() to reader = csv.DictReader(csvfile)
As of now, reader is a string that contains all the contents of your file. for m in reader makes m each character in this string, and you cannot access the "No" key on a character.
a_file = open("sample.json", "r")
a_json = json.load(a_file)
pretty_json = json.dumps(a_json, indent=4)
a_file.close()
print(pretty_json)
Using this sample to print the contents of your json file. Have a good day.
json.decoder.JSONDecodeError: Unterminated string starting at: line 1 column 67 (char 66)
Python cannot recognize some characters in JSON. I tried a lot of encoding, but it still cannot be displayed correctly. The JSON file is very large, I want to skip unrecognized lines and continue working
Below is my code, help me modify it. Use parameters 'ignore'
# -*-coding:utf-8-*-
import csv
import json
import sys
import codecs
def trans(path):
jsonData = codecs.open('C:/Users/jeri/Desktop/1.json', 'r', encoding='utf-8')
# csvfile = open(path+'.csv', 'w')
# csvfile = open(path+'.csv', 'wb')
csvfile = open('C:/Users/jeri/Desktop/1.csv', 'w', encoding='utf-8',
newline='')
writer = csv.writer(csvfile, delimiter=',')
flag = True
for line in jsonData:
dic = json.loads(line)
if flag:
keys = list(dic.keys())
print(keys)
writer.writerow(keys)
flag = False
writer.writerow(list(dic.values()))
jsonData.close()
csvfile.close()
if __name__ == '__main__':
path = str(sys.argv[0])
print(path)
trans(path)
try to create a brand new file and paste your json data there and change extension to .json try to import that file and see if that fix the error
I am writing a simple function to save a twitter search as a JSON, and then load the results. The save function seems to work but the load one doesn't. The error I receive is:
"UnsupportedOperation: not readable"
Can you please advise what the issue might be in my script?
import io
def save_json(filename, data):
with open('tweet2.json', 'w', encoding='utf8') as file:
json.dump(data, file, ensure_ascii = False)
def load_json(filename):
with open('tweet2.json', 'w', encoding = 'utf8') as file:
return json.load(file)
#sample usage
q = 'Test'
results = twitter_search(twitter_api, q, max_results = 10)
save_json = (q, results)
results = load_json(q)
print(json.dumps(results, indent = 1, ensure_ascii = False))
Using "w" you won't be able to read the file so you need to use "r" (Opens a file for reading only.)
open("tweet2.json","r")
There are two CSV files. I need to convert to JSON. Code is below
import csv
import json
import os
import glob
os.chdir(r'C:\Users\user\Desktop\test' )
result = glob.glob( '*.csv' )
print (result)
def make_json(csvFile, jsonFile):
csvFile, jsonFile = '',''
for i in result:
data = {}
with open(csvFile, encoding='utf-8') as csvf:
csvReader = csv.DictReader(csvf)
for rows in csvReader:
key = rows['id']
data[key] = rows
with open(jsonFile, 'w', encoding='utf-8') as jsonf:
jsonf.write(json.dumps(data, indent=4))
csvFilePath =f"{i}"
jsonFilePath =f"{i.split('.')[-2]}.json"
make_json(csvFile, jsonFile)
I got error > csvFile is not mentioned. But the third line from the end mentions the CSV file.
Disclaimer. Please find the error in the code. I already know of the working code which is in pandas
Below is the correct code, but I would recommend you learn to use the python debugger so you can resolve any logic flaws in your code next time. Documentation on the python debugger can be found here:
https://docs.python.org/3/library/pdb.html
Your code was structured in a way that meant for each csv file, you were not setting the file name until after you attempted to open it. The immediate error you saw was caused because you tried to call make_json() before you defined the values for csvFile and jsonFile.
I would recommend changing the code to:
import csv
import json
import glob
def make_json(csvList):
for csvFile in csvList:
data = {}
with open(csvFile, encoding='utf-8') as csvf:
csvReader = csv.DictReader(csvf)
for rows in csvReader:
key = rows['id']
data[key] = rows
jsonFile =f"{csvFile.split('.')[-2]}.json"
with open(jsonFile, 'w', encoding='utf-8') as jsonf:
jsonf.write(json.dumps(data, indent=4))
make_json(glob.glob('*.csv'))
You should try this
import csv, json, os, glob
os.chdir(r'C:\Users\user\Desktop\test' )
result = glob.glob( '*.csv' )
print(result)
def make_json():
for i in result:
with open(i, encoding='utf-8') as csvf:
data = [row for row in csv.DictReader(csvf)]
with open(f"{i.split('.')[-2]}.json", 'w', encoding='utf-8') as jsonf:
json.dump(data, jsonf)
make_json()
You did not initialize both the arguments of make_json() - (csvFilePath & jsonFilePath)
I am trying to performing text analysis on Chinese texts. The program is provided below. I got the result with unreadable characters such as 浜烘皯鏃ユ姤绀捐. And if I change the output file result.csv to result.txt, the characters are correct as 人民日报社论. So what's wrong with this? I can not figure out. I tried several ways including add decoder and encoder.
# -*- coding: utf-8 -*-
import os
import glob
import jieba
import jieba.analyse
import csv
import codecs
segList = []
raw_data_path = 'monthly_raw_data/'
file_name = ["201010", "201011", "201012", "201101", "201103", "201105", "201107", "201109", "201110", "201111", "201112", "201201", "201202", "201203", "201205", "201206", "201208", "201210", "201211"]
jieba.load_userdict("customized_dict.txt")
for name in file_name:
all_text = ""
multi_line_text = ""
with open(raw_data_path + name + ".txt", "r") as file:
for line in file:
if line != '\n':
multi_line_text += line
templist = multi_line_text.split('\n')
for text in templist:
all_text += text
seg_list = jieba.cut(all_text,cut_all=False)
temp_text = []
for item in seg_list:
temp_text.append(item.encode('utf-8'))
stop_list = []
with open("stopwords.txt", "r") as stoplistfile:
for item in stoplistfile:
stop_list.append(item.rstrip('\r\n'))
text_without_stopwords = []
for word in temp_text:
if word not in stop_list:
text_without_stopwords.append(word)
segList.append(text_without_stopwords)
with open("results/result.csv", 'wb') as f:
writer = csv.writer(f)
writer.writerows(segList)
For UTF-8 encoding, Excel requires a BOM (byte order mark) codepoint written at the start of the file or it will assume an ANSI encoding, which is locale-dependent. U+FEFF is the Unicode BOM. Here's an example that will open in Excel correctly:
#!python2
#coding:utf8
import csv
data = [[u'American', u'美国人'],
[u'Chinese', u'中国人']]
with open('results.csv','wb') as f:
f.write(u'\ufeff'.encode('utf8'))
w = csv.writer(f)
for row in data:
w.writerow([item.encode('utf8') for item in row])
Python 3 makes this easier. Use 'w', newline='', encoding='utf-8-sig' parameters instead of 'wb' which will accept Unicode strings directly and automatically write a BOM:
#!python3
#coding:utf8
import csv
data = [['American', '美国人'],
['Chinese', '中国人']]
with open('results.csv', 'w', newline='', encoding='utf-8-sig') as f:
w = csv.writer(f)
w.writerows(data)
There is also a 3rd–party unicodecsv module that makes Python 2 easier to use as well:
#!python2
#coding:utf8
import unicodecsv
data = [[u'American', u'美国人'],
[u'Chinese', u'中国人']]
with open('results.csv', 'wb') as f:
w = unicodecsv.writer(f ,encoding='utf-8-sig')
w.writerows(data)
Here is another way kinda tricky:
#!python2
#coding:utf8
import csv
data = [[u'American',u'美国人'],
[u'Chinese',u'中国人']]
with open('results.csv','wb') as f:
f.write(u'\ufeff'.encode('utf8'))
w = csv.writer(f)
for row in data:
w.writerow([item.encode('utf8') for item in row])
This code block generate csv file encoded utf-8 .
open file with notepad++ (or other Editor with encode feature)
Encoding -> convert to ANSI
save
Open file with Excel, it's OK.