Display unicode in html - python

I'm writing a script to export my links and their titles from Chrome to HTML.
Chrome bookmarks are stored as JSON, in UTF-8 encoding.
Some titles are in Russian, so they are stored like this:
"name": "\u0425\u0430\u0431\u0440\ ..."
import codecs
f = codecs.open("chrome.json", "r", "utf-8")
data = f.readlines()
urls = []   # for links
names = []  # for link titles
ind = 0
for i in data:
    if i.find('"url":') != -1:
        urls.append(i.split('"')[3])
        names.append(data[ind-2].split('"')[3])
    ind += 1
fw = codecs.open("chrome.html", "w", "utf-8")
fw.write("<html><body>\n")
for n in names:
    fw.write(n + '<br>')
    # print type(n) # this will return <type 'unicode'> for each url!
fw.write("</body></html>")
Now, in chrome.html those titles are displayed literally as \u0425\u0430\u0431...
How can I turn them back into Russian?
(Using Python 2.5.)
**Edit: Solved!**
>>> s = '\u041f\u0440\u0438\u0432\u0435\u0442 world!'
>>> type(s)
<type 'str'>
>>> print s.decode('raw-unicode-escape').encode('utf-8')
Привет world!
That's what I needed: converting a str full of \u041f... escapes into Unicode.
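A side note on the codec choice: 'raw-unicode-escape' only interprets \uXXXX sequences, while 'unicode-escape' would also turn a literal \n in the text into a real newline (the sample string below is made up):
>>> s = '\\n \u0425'
>>> s.decode('raw-unicode-escape')
u'\\n \u0425'
>>> s.decode('unicode-escape')
u'\n \u0425'
With that, the final script becomes: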
f = open("chrome.json", "r")
data = f.readlines()
f.close()
urls = []   # for links
names = []  # for link titles
ind = 0
for i in data:
    if i.find('"url":') != -1:
        urls.append(i.split('"')[3])
        names.append(data[ind-2].split('"')[3])
    ind += 1
fw = open("chrome.html", "w")
fw.write("<html><body>\n")
for n in names:
    fw.write(n.decode('raw-unicode-escape').encode('utf-8') + '<br>')
fw.write("</body></html>")

By the way, it's not just Russian; non-ASCII characters are quite common in page names. Example:
name=u'Python Programming Language \u2013 Official Website'
url=u'http://www.python.org/'
As an alternative to fragile code like
urls.append(i.split('"')[3])
names.append(data[ind-2].split('"')[3])
# (1) relies on name being 2 lines before url
# (2) fails if there is a `"` in the name
# example: "name": "The \"Fubar\" website",
you could process the input file using the json module. For Python 2.5, you can install simplejson.
Here's a script that emulates yours:
try:
    import json
except ImportError:
    import simplejson as json
import sys

def convert_file(infname, outfname):
    def explore(folder_name, folder_info):
        for child_dict in folder_info['children']:
            ctype = child_dict.get('type')
            name = child_dict.get('name')
            if ctype == 'url':
                url = child_dict.get('url')
                # print "name=%r url=%r" % (name, url)
                fw.write(name.encode('utf-8') + '<br>\n')
            elif ctype == 'folder':
                explore(name, child_dict)
            else:
                print "*** Unexpected ctype=%r ***" % ctype
    f = open(infname, 'rb')
    bmarks = json.load(f)
    f.close()
    fw = open(outfname, 'w')
    fw.write("<html><body>\n")
    for folder_name, folder_info in bmarks['roots'].iteritems():
        explore(folder_name, folder_info)
    fw.write("</body></html>")
    fw.close()

if __name__ == "__main__":
    convert_file(sys.argv[1], sys.argv[2])
Tested using Python 2.5.4 on Windows 7 Pro.

It's a JSON file, so read it using a JSON parser. That will give you a Unicode string directly, without you having to unescape it. This is going to be much more reliable (as well as simpler), since JSON strings are not the same format as Python strings.
(They're pretty similar and both use the \u format, but your current code will fall over badly for other escaped characters, not to mention that it relies on the exact attribute order and whitespace settings of a JSON file, which makes it very fragile indeed.)
import json, cgi, codecs

with open('chrome.json') as fp:
    bookmarks = json.load(fp)
with codecs.open('chrome.html', 'w', 'utf-8') as fp:
    fp.write(u'<html><body>\n')
    for root in bookmarks[u'roots'].values():
        for child in root['children']:
            # the markup below is an assumption: the tags in the original
            # format string were lost, but one <a> link per bookmark is the
            # evident intent
            fp.write(u'<a href="%s">%s</a><br>\n' % (
                cgi.escape(child[u'url']),
                cgi.escape(child[u'name'])
            ))
    fp.write(u'</body></html>')
Note also the use of cgi.escape to HTML-encode any < or & characters in the strings.
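For reference, cgi.escape replaces &, < and > (it escapes quotes only if you pass quote=True):
>>> import cgi
>>> cgi.escape(u'The "Fubar" <website> & more')
u'The "Fubar" &lt;website&gt; &amp; more'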

I'm not sure where you're trying to display the Russian text, but in the interpreter you can do the following to see it:
s = '\u0425\u0430\u0431'
l = s.split('\u')
l.remove('')
for x in l:
    print(unichr(int(x, 16))),
This will give the following output:
Х а б
If you're storing it in HTML, you're better off leaving it as '\u0425...' until you need to convert it.
Hope this helps.

You could include the UTF-8 BOM, so Chrome knows to read it as UTF-8, not ASCII:
fw = codecs.open("chrome.html","w","utf-8")
fw.write(codecs.BOM_UTF8.decode('utf-8'))
fw.write(u'你好')
Oh, but if you open fw in python, remember to use 'utf-8-sig' to strip the BOM.
Maybe you need to encode the unicode into UTF-8 before writing, but I think codecs already does that, right?
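For instance, a minimal sketch of reading the file back with the BOM stripped (reusing the filename from above; no with-statement, since the question targets Python 2.5):
import codecs
f = codecs.open("chrome.html", "r", "utf-8-sig")
content = f.read()  # the BOM is consumed by the codec; content is a unicode object
f.close()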

Related

Saving text with Polish characters (utf-8) to a textfile from JSON in Python

I am trying to save a conversation from Messenger to a textfile, including things like timestamps and senders.
In the JSON file downloaded from Messenger, the emojis and Polish characters are stored as literal UTF-8 byte escapes (e.g. "ą" as \xc4\x85).
After executing this program:
import json
from datetime import datetime

messages = []
jsonfiles = ["message_1.json", "message_2.json", "message_3.json", "message_4.json",
             "message_5.json", "message_6.json", "message_7.json", "message_8.json",
             "message_9.json", "message_10.json", "message_11.json"]

def filldict(textfile, jsonfile):
    with open(textfile, "a", encoding="utf-8") as w:
        with open(jsonfile, "r", encoding="utf-8") as j:
            data = json.load(j)
            i = 0
            while i < len(data["messages"]):
                message = {}
                if "content" in data["messages"][len(data["messages"])-1-i]:
                    stamp = int(data["messages"][len(data["messages"])-1-i]["timestamp_ms"])
                    date = datetime.fromtimestamp(stamp/1000)
                    message['timestamp'] = stamp
                    message['date'] = date
                    w.write(str(date))
                    w.write(" ")
                    w.write(data["messages"][len(data["messages"])-1-i]["sender_name"])
                    message['sender'] = data["messages"][len(data["messages"])-1-i]["sender_name"]
                    w.write(": ")
                    if "content" in str(data["messages"][len(data["messages"])-1-i]):
                        w.write(data["messages"][len(data["messages"])-1-i]["content"])
                        message['content'] = data["messages"][len(data["messages"])-1-i]["content"]
                    w.write("\n")
                i += 1
                messages.append(message)
                message = {}

j = len(jsonfiles)
while j > 0:
    filldict("messages11.txt", jsonfiles[j-1])
    j -= 1
print("process finished")
the output textfile contains those UTF-8 literals instead of the characters they represent. What can I do to fix it and display the Polish characters (and, if that's even possible, emojis) in the textfile? I thought that including encoding="utf-8" would be enough. Thank you for any clues.
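One likely cause, assuming this is the standard Facebook Messenger export quirk: the JSON escapes each raw UTF-8 byte as its own \u00XX sequence, so json.load produces Latin-1-style mojibake, and encoding="utf-8" on the files can't help. A minimal sketch of a fix, applied to each decoded string:
def fix_mojibake(s):
    # Messenger-style exports (an assumption here) escape raw UTF-8 bytes as
    # \u00XX, so json.load() yields mojibake such as '\u00c4\u0085' for 'ą'.
    # Round-tripping through Latin-1 recovers the intended characters.
    return s.encode("latin-1").decode("utf-8")

print(fix_mojibake("\u00c4\u0085"))  # prints: ą
You would apply it to sender_name and content right after json.load.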

write list of paragraph tuples to a csv file

The following code is designed to write tuples, each containing a large paragraph of text and two identifiers, one tuple per line.
import urllib2
import json
import csv

base_url = "https://www.eventbriteapi.com/v3/events/search/?page={}"
writer = csv.writer(open("./data/events.csv", "a"))
writer.writerow(["description", "category_id", "subcategory_id"])

def format_event(event):
    return event["description"]["text"].encode("utf-8").rstrip("\n\r"), event["category_id"], event["subcategory_id"]

for x in range(1, 2):
    print "fetching page - {}".format(x)
    formatted_url = base_url.format(str(x))
    resp = urllib2.urlopen(formatted_url)
    data = resp.read()
    j_data = json.loads(data)
    events = map(format_event, j_data["events"])
    for event in events:
        #print event
        writer.writerow(event)
    print "wrote out events for page - {}".format(x)
The ideal format would be to have each line contain a single paragraph, followed by the other fields listed above, yet here is a screenshot of how the data comes out.
If instead I change this line to the following:
writer.writerow([event])
Here is how the file now looks:
It certainly looks much closer to what I want, but it's got parentheses around each entry, which are undesirable.
EDIT
Here is a snippet that contains a sample of the data I'm working with.
Can you try writing to the CSV file directly without using the csv module? You can write/append comma-delimited strings to the CSV file just like writing to typical text files. Also, the way you deal with removing \r and \n characters might not be working. You can use a regex to find those characters and replace them with an empty string "":
import urllib2
import json
import re

base_url = "https://www.eventbriteapi.com/v3/events/search/?page={}"

def format_event(event):
    ws_to_strip = re.compile(r"(\r|\n)")
    description = re.sub(ws_to_strip, "", event["description"]["text"].encode("utf-8"))
    return [description, event["category_id"], event["subcategory_id"]]

with open("./data/events.csv", "a") as events_file:
    # trailing newline so the header and rows land on separate lines
    events_file.write(",".join(["description", "category_id", "subcategory_id"]) + "\n")
    for x in range(1, 2):
        print "fetching page - {}".format(x)
        formatted_url = base_url.format(str(x))
        resp = urllib2.urlopen(formatted_url)
        data = resp.read()
        j_data = json.loads(data)
        events = map(format_event, j_data["events"])
        for event in events:
            events_file.write(",".join(event) + "\n")
        print "wrote out events for page - {}".format(x)
Change your csv writer to be DictWriter.
Make a few tweaks:
def format_event(event):
    return {"description": event["description"]["text"].encode("utf-8").rstrip("\n\r"),
            "category_id": event["category_id"],
            "subcategory_id": event["subcategory_id"]}
There may be a few other small things you need to do, but using DictWriter and formatting your data appropriately has been the easiest way I've found to work with CSV files.
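A minimal sketch of the DictWriter wiring (assuming the format_event above and the same column names; writeheader needs Python 2.7+):
import csv

fieldnames = ["description", "category_id", "subcategory_id"]
with open("./data/events.csv", "a") as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()  # Python 2.7+; on 2.6 write the header row manually
    for event in map(format_event, j_data["events"]):  # j_data as in the question
        writer.writerow(event)  # dict keys are matched to fieldnames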

<class 'UnicodeDecodeError'> that only appears in Python 3 but not Python 2

I am doing a project analyzing tweets for an Urban Policy class. The purpose of this script is to parse out certain information from JSON files that a colleague downloaded. Here's a link to a sample Tweet I am trying to parse:
https://www.dropbox.com/s/qf1e06601m2mrxr/5thWardChicago.0.json?dl=0
I had a friend of mine test the following script in some version of Python 2 (Windows) and it worked. However, my machine (Windows 10) is running a recent version of Python 3 and it's not working for me.
import json
import collections
import sys, os
import glob
from datetime import datetime
import csv

def convert(input):
    if isinstance(input, dict):
        return {convert(key): convert(value) for key, value in input.iteritems()}
    elif isinstance(input, list):
        return [convert(element) for element in input]
    elif isinstance(input, unicode):
        return input.encode('utf-8')
    else:
        return input

def to_ilan_csv(json_files):
    # write the column headers
    csv_writer = csv.writer(open("test.csv", "w"))
    headers = ["tweet_id", "handle", "username", "tweet_text", "has_image", "image_url",
               "created_at", "retweets", "hashtags", "mentions", "isRT", "isMT"]
    csv_writer.writerow(headers)
    # open the JSON files we stored and parse them into the CSV file we're working on
    try:
        #json_files = glob.glob(folder)
        print("Parsing %s files." % len(json_files))
        for file in json_files:
            f = open(file, 'r')
            if f != None:
                for line in f:
                    # hack to avoid the trailing \n at the end of the file - sticking point LH 4/7/16
                    if len(line) > 3:
                        i = 0
                        tweets = convert(json.loads(line))
                        for tweet in tweets:
                            has_media = False
                            is_RT = False
                            is_MT = False
                            hashtags_list = []
                            mentions_list = []
                            media_list = []
                            entities = tweet["entities"]
                            # old tweets don't have key "media" so need a workaround
                            if entities.has_key("media"):
                                has_media = True
                                for item in entities["media"]:
                                    media_list.append(item["media_url"])
                            for hashtag in entities["hashtags"]:
                                hashtags_list.append(hashtag["text"])
                            for user in entities["user_mentions"]:
                                mentions_list.append(user["screen_name"])
                            if tweet["text"][:2] == "RT":
                                is_RT = True
                            if tweet["text"][:2] == "MT":
                                is_MT = True
                            values = [
                                tweet["id_str"],
                                tweet["user"]["id_str"],
                                tweet["user"]["screen_name"],
                                tweet["text"],
                                has_media,
                                ','.join(media_list) if len(media_list) > 0 else "",
                                datetime.strptime(tweet["created_at"], '%a %b %d %H:%M:%S +0000 %Y').strftime('%Y-%m-%d %H:%M:%S'),
                                tweet["retweet_count"],
                                ','.join(hashtags_list) if len(hashtags_list) > 0 else "",
                                ','.join(mentions_list) if len(mentions_list) > 0 else "",
                                is_RT,
                                is_MT
                            ]
                            csv_writer.writerow(values)
                    else:
                        continue
            f.close()
    except:
        print("Something went wrong. Quitting.")
        for i in sys.exc_info():
            print(i)

def parse_tweets():
    file_names = []
    file_names.append("C:\\Users\\Adam\\Downloads\\Test Code\\sample1.json")
    file_names.append("C:\\Users\\Adam\\Downloads\\Test Code\\sample2.json")
    to_ilan_csv(file_names)
Then I execute by simply performing
parse_tweets()
But I get the following error:
Parsing 2 files.
Something went wrong. Quitting.
<class 'UnicodeDecodeError'>
'charmap' codec can't decode byte 0x9d in position 3338: character maps to <undefined>
<traceback object at 0x0000016CCFEE5648>
I sought help from a CS friend of mine but he was unable to diagnose the problem. So I've come here.
MY QUESTION
What is this error and why is it only arising in Python 3 instead of Python 2?
For those who want to try, the code as presented should be able to be run using a Jupyter notebook and the copy of the file in the drop box link I provided.
Sooo, after a bit of debugging in chat, here's the solution:
Apparently, the file OP was using was not correctly recognized as UTF-8, so iterating over the file (with for line in f) caused the UnicodeDecodeError from the cp1252 encoding module. We fixed that by explicitly opening the file as UTF-8:
f = open(file, 'r', encoding='utf-8')
After we did that, the file could be opened correctly and OP ran into the Python 3 issues we all have been expecting and seeing before. The following three issues came up:
'dict' object has no attribute 'iteritems'
dict.iteritems() no longer exists in Python 3, so we just switch to dict.items() here:
return {convert(key): convert(value) for key, value in input.items()}
name 'unicode' is not defined
unicode is no longer a separate type in Python 3; the normal str type is already Unicode, so we just delete this case:
elif isinstance(input, unicode):
return input.encode('utf-8')
'dict' object has no attribute 'has_key'
To check whether a key exists in a dictionary, we use the in operator, so the if check becomes the following:
if "media" in entities:
Afterwards, the code should run fine with Python 3.
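Putting the three fixes together, the convert helper might look like this under Python 3 (a sketch; arguably the function can be dropped entirely, since json.loads already returns str):
def convert(input):
    # Python 3: items() replaces iteritems(); str is already Unicode,
    # so the unicode-to-bytes branch is gone.
    if isinstance(input, dict):
        return {convert(key): convert(value) for key, value in input.items()}
    elif isinstance(input, list):
        return [convert(element) for element in input]
    else:
        return input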

Loading a json file in python

I've got multiple files to load as JSON. They are all formatted the same way, but for one of them I can't load it without raising an exception. This is where you can find the file:
File
I wrote the following code:
def from_seed_data_extract_summoners():
    summonerIds = set()
    for i in range(1, 11):
        file_name = 'data/matches%s.json' % i
        print file_name
        with open(file_name) as data_file:
            data = json.load(data_file)
        for match in data['matches']:
            for summoner in match['participantIdentities']:
                summonerIds.add(summoner['player']['summonerId'])
    return summonerIds
The error occurs when I do the following: json.load(data_file). I suppose there is a special character but I can't find it and don't know how to replace it. The error generated is:
UnicodeDecodeError: 'utf8' codec can't decode byte 0xeb in position 6: invalid continuation byte
Do you know how I can get rid of it?
Your JSON is trying to force the data into unicode, not just a simple string. You've got some embedded character (probably a space or something not very noticeable) that can't be forced into unicode.
How to get string objects instead of Unicode ones from JSON in Python?
That is a great thread about making JSON objects more manageable in python.
replace file_name = 'data/matches%s.json' % i with file_name = 'data/matches%i.json' % i
the right syntax is data = json.load(file_name) and not:
with open(file_name) as data_file:
    data = json.load(data_file)
EDIT:
def from_seed_data_extract_summoners():
    summonerIds = set()
    for i in range(1, 11):
        file_name = 'data/matches%i.json' % i
        with open(file_name) as f:  # was open(file_path), which is undefined here
            data = json.load(f, encoding='utf-8')
        for match in data['matches']:
            for summoner in match['participantIdentities']:
                summonerIds.add(summoner['player']['summonerId'])
    return summonerIds
Try:
json.loads(unicode(data_file.read(), errors='ignore'))
or:
json.loads(unidecode.unidecode(unicode(data_file.read(), errors='ignore')))
(for the second, you would need to install unidecode)
Try:
json.loads(data_file.read(), encoding='utf-8')

Creating wiki links reading from XML in Python

I am trying to read all the links in the tag and then create wiki links out of them. Basically, I want to read each link from the XML file and then create a wiki link using the last word of the link (see below for what I mean by "last word"). For some reason I am running into the following error; what am I missing? Please suggest.
http://wiki.build.com/ca_builds/CIT (last word is CIT)
http://wiki.build.com/ca_builds/1.2_Archive (last word is 1.2_Archive)
INPUT XML:-
<returnLink>
http://wiki.build.com/ca_builds/CIT
http://wiki.build.com/ca_builds/1.2_Archive
</returnLink>
PYTHON code
def getReturnLink(xml):
    """Collects the link to return to the PL home page from the config file."""
    if xml.find('<returnLink>') == -1:
        return None
    else:
        linkStart = xml.find('<returnLink>')
        linkEnd = xml.find('</returnLink>')
        link = xml[linkStart+12:linkEnd].strip()
        link = link.split('\n')
        #if link.find('.com') == -1:
        #    return None
        for line in link:
            line = line.strip()
            print "LINE"
            print line
            lastword = line.rfind('/') + 1
            line = '['+link+' lastword]<br>'
            linklis.append(line)
        return linklis
OUTPUT:-
line = '['+link+' lastword]<br>'
TypeError: cannot concatenate 'str' and 'list' objects
EXPECTED OUTPUT:-
CIT (this will point to http://wiki.build.com/ca_builds/CIT)
1.2_Archive (this will point to http://wiki.build.com/ca_builds/1.2_Archive)
The Python standard library has an XML parser. You can also support multiple <returnLink> elements and Unicode words in a URL:
import posixpath
import urllib
import urlparse
from xml.etree import cElementTree as etree

def get_word(url):
    basename = posixpath.basename(urlparse.urlsplit(url).path)
    return urllib.unquote(basename).decode("utf-8")

urls = (url.strip()
        for links in etree.parse(input_filename_or_file).iter('returnLink')
        for url in links.text.splitlines())
wikilinks = [u"[{} {}]".format(url, get_word(url))
             for url in urls if url]
print(wikilinks)
Note: work with Unicode internally. Convert the text to bytes only to communicate with the outside world, e.g., when writing to a file.
Example
[http://wiki.build.com/ca_builds/CIT#some-fragment CIT]
[http://wiki.build.com/ca_builds/Unicode%20%28%E2%99%A5%29 Unicode (♥)]
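For instance, a sketch of encoding only at that boundary, writing the wikilinks list from above (the output filename is made up):
import codecs

with codecs.open('links.wiki', 'w', 'utf-8') as out:  # hypothetical filename
    out.write(u'\n'.join(wikilinks))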
Instead of parsing XML by hand, use a library like lxml:
>>> s = """<returnLink>
... http://wiki.build.com/ca_builds/CIT
... http://wiki.build.com/ca_builds/1.2_Archive
... </returnLink>"""
>>> from lxml import etree
>>> xml_tree = etree.fromstring(s)
>>> links = xml_tree.text.split()
>>> for i in links:
... print '['+i+']'+i[i.rfind('/')+1:]
...
[http://wiki.build.com/ca_builds/CIT]CIT
[http://wiki.build.com/ca_builds/1.2_Archive]1.2_Archive
I'm not sure what you mean by wikilinks, but the above should give you an idea on how to parse the string.
I'm having some difficulty understanding your question, but it seems like you just want to return the string after the last '/' character in the link? You can do this with a reverse find:
return link[link.rfind('/') + 1:]
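For example, with one of the URLs from the question:
>>> link = 'http://wiki.build.com/ca_builds/1.2_Archive'
>>> link[link.rfind('/') + 1:]
'1.2_Archive'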
