Get header of a HTTP payload - python

I got a piece of code that is supposed to decode the pcap files to write the images in a directory. I capture packets via wireshark, browse on http websites to get images, and place them in a directory.
def get_header(payload):
try :
header_raw = payload[:payload.index(b'\r\n\r\n')+2]
except ValueError:
sys.stdout.write('-')
sys.stdout.flush()
return None
header = dict(re.findall(r'(?P<name>.*?): (?P<value>.*?)\r\n', header_raw.decode()))
# This line of code is supposed to split out the headers
if 'Content-Type' not in header:
return None
return header
When I try to run it, it gives me this :
Traceback (most recent call last):
File "/home/kali/Documents/Programs/Python/recapper.py", line 79, in <module>
recapper.get_responses()
File "/home/kali/Documents/Programs/Python/recapper.py", line 62, in get_responses
header = get_header(payload)
File "/home/kali/Documents/Programs/Python/recapper.py", line 24, in get_header
header = dict(re.findall(r"(?P<name>.*?): (?P<value>.*?)\r\n", header_raw.decode()))
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc6 in position 1: invalid continuation byte
I tried different things, but can't get it right. Can anyone more experienced than me tell what is the problem or how can I split out the header if i'm doing it wrong ?

Update : I found that the encoding I had to use wasn't utf-8, but ISO-8859-1
Like this: header = dict(re.findall(r'(?P<name>.*?): (?P<value>.*?)\r\n', header_raw.decode(ISO-8859-1))), and it works !

Related

Unable to encode a unicode into a .txt file in python

So while trying to mess around with python i tried making a program which would get me the content from pastebin url's and then save each ones content into a file of their own. I got an error
This is the code :-
import requests
file = open("file.txt", "r", encoding="utf-8").readlines()
for line in file:
link = line.rstrip("\n")
n_link = link.replace("https://pastebin.com/", "https://pastebin.com/raw/")
pastebin = n_link.replace("https://pastebin.com/raw/", "")
r = requests.get(n_link, timeout=3)
x = open(f"{pastebin}.txt", "a+")
x.write(r.text)
x.close
I get the following error :-
Traceback (most recent call last):
File "C:\Users\Lenovo\Desktop\Py\Misc. Scripts\ai.py", line 9, in <module>
x.write(r.text)
File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python39\lib\encodings\cp1252.py", line 19, in encode
return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode character '\u2694' in position 9721: character maps to <undefined>
Can somebody help?
You’re doing good at the start by reading in the input file as UTF-8. The only thing you’re missing is to do the same thing with your output file:
x = open(f"{pastebin}.txt", "a+", encoding="utf-8")

Encoding Error with Beautiful Soup: Character Maps to Undefined (Python)

I've written a script that is supposed to retrieve html pages off a site and update their contents. The following function looks for a certain file on my system, then attempts to open it and edit it:
def update_sn(files_to_update, sn, table, title):
paths = files_to_update['files']
print('updating the sn')
try:
sn_htm = [s for s in paths if re.search('^((?!(Default|Notes|Latest_Addings)).)*htm$', s)][0]
notes_htm = [s for s in paths if re.search('_Notes\.htm$', s)][0]
except Exception:
print('no sns were found')
pass
new_path_name = new_path(sn_htm, files_to_update['predecessor'], files_to_update['original'])
new_sn_number = sn
htm_text = open(sn_htm, 'rb').read().decode('cp1252')
content = re.findall(r'(<table>.*?<\/table>.*)(?:<\/html>)', htm_text, re.I | re.S)
minus_content = htm_text.replace(content[0], '')
table_soup = BeautifulSoup(table, 'html.parser')
new_soup = BeautifulSoup(minus_content, 'html.parser')
head_title = new_soup.title.string.replace_with(new_sn_number)
new_soup.link.insert_after(table_soup.div.next)
with open(new_path_name, "w+") as file:
result = str(new_soup)
try:
file.write(result)
except Exception:
print('Met exception. Changing encoding to cp1252')
try:
file.write(result('cp1252'))
except Exception:
print('cp1252 did\'nt work. Changing encoding to utf-8')
file.write(result.encode('utf8'))
try:
print('utf8 did\'nt work. Changing encoding to utf-16')
file.write(result.encode('utf16'))
except Exception:
pass
This works in the majority of cases, but sometimes it fails to write, at which point the exception kicks in and I try every feasible encoding without success:
updating the sn
Met exception. Changing encoding to cp1252
cp1252 did'nt work. Changing encoding to utf-8
Traceback (most recent call last):
File "C:\Users\Joseph\Desktop\SN Script\update_files.py", line 145, in update_sn
file.write(result)
File "C:\Users\Joseph\AppData\Local\Programs\Python\Python36\lib\encodings\cp1252.py", line 19, in encode
return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode characters in position 4006-4007: character maps to <undefined>
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\Joseph\Desktop\SN Script\update_files.py", line 149, in update_sn
file.write(result('cp1252'))
TypeError: 'str' object is not callable
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "scraper.py", line 79, in <module>
get_latest(entries[0], int(num), entries[1])
File "scraper.py", line 56, in get_latest
update_files.update_sn(files_to_update, data['number'], data['table'], data['title'])
File "C:\Users\Joseph\Desktop\SN Script\update_files.py", line 152, in update_sn
file.write(result.encode('utf8'))
TypeError: write() argument must be str, not bytes
Can anyone give me any pointers on how to better handle html data that might have inconsistent encoding?
In your code you open the file in text mode, but then you attempt to write bytes (str.encode returns bytes) and so Python throws an exception:
TypeError: write() argument must be str, not bytes
If you want to write bytes, you should open the file in binary mode.
BeautifulSoup detects the document’s encoding (if it is bytes) and converts it to string automatically. We can access the encoding with .original_encoding, and use it to encode the content when writting to file. For example,
soup = BeautifulSoup(b'<tag>ascii characters</tag>', 'html.parser')
data = soup.tag.text
encoding = soup.original_encoding or 'utf-8'
print(encoding)
#ascii
with open('my.file', 'wb+') as file:
file.write(data.encode(encoding))
In order for this to work you should pass your html as bytes to BeautifulSoup, so don't decode the response content.
If BeautifulSoup fails to detect the correct encoding for some reason, then you could try a list of possible encodings, like you have done in your code.
data = 'Somé téxt'
encodings = ['ascii', 'utf-8', 'cp1252']
with open('my.file', 'wb+') as file:
for encoding in encodings:
try:
file.write(data.encode(encoding))
break
except UnicodeEncodeError:
print(encoding + ' failed.')
Alternatively, you could open the file in text mode and set the encoding in open (instead of encoding the content), but note that this option is not available in Python2.
Just out of curiosity, is this line of code a typo file.write(result('cp1252'))? Seems like it is missing .encode method.
Traceback (most recent call last):
File "C:\Users\Joseph\Desktop\SN Script\update_files.py", line 149, in update_sn
file.write(result('cp1252'))
TypeError: 'str' object is not callable
Will it work perfectly if you modify the code to: file.write(result.encode('cp1252'))
I once had this write to file with encoding problem and brewed my own solution through the following thread:
Saving utf-8 texts in json.dumps as UTF8, not as \u escape sequence
.
My problem solved by changing the html.parser parsing mode to html5lib. I root-caused my problem due to malformed HTML tag and solved it with html5lib parser. For your reference, this is the documentation for each parser provided by BeautifulSoup.
Hope this helps

Python: File encoding errors

From a few days I'm struggling this annoying problem with file encoding in my little program in Python.
I work a lot with MediaWiki - recently I do documents conversion from .doc to Wikisource.
Document in Microsoft Word format is opened in Libre Office and then exported to .txt file with Wikisource format. My program is searching for a [[Image:]] tag and replace it with a name of image taken from a list - and that mechanism works really fine (Big Thanks for help brjaga!).
When I did some test on .txt files created by me everything worked just fine but when I put a .txt file with Wikisource whole thing is not so funny anymore :D
I got this message prom Python:
Traceback (most recent call last):
File "C:\Python33\final.py", line 15, in <module>
s = ' '.join([line.replace('\n', '') for line in myfile.readlines()])
File "C:\Python33\lib\encodings\cp1250.py", line 23, in decode
return codecs.charmap_decode(input,self.errors,decoding_table)[0]
UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 7389: character maps to <undefined>
And this is my Python code:
li = [
"[[Image:124_BPP_PL_PL_Page_03_Image_0001.jpg]]",
"[[Image:124_BPP_PL_PL_Page_03_Image_0002.jpg]]",
"[[Image:124_BPP_PL_PL_Page_03_Image_0003.jpg]]",
"[[Image:124_BPP_PL_PL_Page_03_Image_0004.jpg]]",
"[[Image:124_BPP_PL_PL_Page_03_Image_0005.jpg]]",
"[[Image:124_BPP_PL_PL_Page_03_Image_0006.jpg]]",
"[[Image:124_BPP_PL_PL_Page_03_Image_0007.jpg]]",
"[[Image:124_BPP_PL_PL_Page_05_Image_0001.jpg]]",
"[[Image:124_BPP_PL_PL_Page_05_Image_0002.jpg]]"
]
with open ("C:\\124_BPP_PL_PL.txt") as myfile:
s = ' '.join([line.replace('\n', '') for line in myfile.readlines()])
dest = open('C:\\124_BPP_PL_PL_processed.txt', 'w')
for item in li:
s = s.replace("[[Image:]]", item, 1)
dest.write(s)
dest.close()
OK, so I did some research and found that this is a problem with encoding. So I installed a program Notepad++ and changed the encoding of my .txt file with Wikisource to: UTF-8 and saved it. Then I did some change in my code:
with open ("C:\\124_BPP_PL_PL.txt", encoding="utf8') as myfile:
s = ' '.join([line.replace('\n', '') for line in myfile.readlines()])
But I got this new error message:
Traceback (most recent call last):
File "C:\Python33\final.py", line 22, in <module>
dest.write(s)
File "C:\Python33\lib\encodings\cp1250.py", line 19, in encode
return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode character '\ufeff' in position 0: character maps to <undefined>
And I'm really stuck on this one. I thought, when I change the encoding manually in Notepad++ and then I will tell the encoding which I set - everything will be good.
Please help, Thank You in advance.
When Python 3 opens a text file, it uses the default encoding for your system when trying to decode the file in order to give you full Unicode text (the str type is fully Unicode aware). It does the same when writing out such Unicode text values.
You already solved the input side; you specified an encoding when reading. Do the same when writing: specify a codec to use to write out the file that can handle Unicode, including the non-breaking whitespace character at codepoint U+FEFF. UTF-8 is usually a good default choice:
dest = open('C:\\124_BPP_PL_PL_processed.txt', 'w', encoding='utf8')
You can use the with statement when writing too and save yourself the .close() call:
for item in li:
s = s.replace("[[Image:]]", item, 1)
with open('C:\\124_BPP_PL_PL_processed.txt', 'w', encoding='utf8') as dest:
dest.write(s)

While reading file on Python, I got a UnicodeDecodeError. What can I do to resolve this?

This is one of my own projects. This will later help benefit other people in a game I am playing (AssaultCube). Its purpose is to break down the log file and make it easier for users to read.
I kept getting this issue. Anyone know how to fix this? Currently, I am not planning to write/create the file. I just want this error to be fixed.
The line that triggered the error is a blank line (it stopped on line 66346).
This is what the relevant part of my script looks like:
log = open('/Users/Owner/Desktop/Exodus Logs/DIRTYLOGS/serverlog_20130430_00.15.21.txt', 'r')
for line in log:
and the exception is:
Traceback (most recent call last):
File "C:\Users\Owner\Desktop\Exodus Logs\Log File Translater.py", line 159, in <module>
main()
File "C:\Users\Owner\Desktop\Exodus Logs\Log File Translater.py", line 7, in main
for line in log:
File "C:\Python32\lib\encodings\cp1252.py", line 23, in decode
return codecs.charmap_decode(input,self.errors,decoding_table)[0]
UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 3074: character maps to <undefined>
Try:
enc = 'utf-8'
log = open('/Users/Owner/Desktop/Exodus Logs/DIRTYLOGS/serverlog_20130430_00.15.21.txt', 'r', encoding=enc)
if it won't work try:
enc = 'utf-16'
log = open('/Users/Owner/Desktop/Exodus Logs/DIRTYLOGS/serverlog_20130430_00.15.21.txt', 'r', encoding=enc)
you could also try it with
enc = 'iso-8859-15'
also try:
enc = 'cp437'
wich is very old but it also has the "ü" at 0x81 wich would fit to the string "üßer" wich I found on the homepage of assault cube.
If all the codings are wrong try to contact some of the guys developing assault cube or as mentioned in a comment: have a look at https://pypi.python.org/pypi/chardet

XML Encoding error while writing it in to file

I think I am following the right approach but I am still getting an encoding error:
from xml.dom.minidom import Document
import codecs
doc = Document()
wml = doc.createElement("wml")
doc.appendChild(wml)
property = doc.createElement("property")
wml.appendChild(property)
descriptionNode = doc.createElement("description")
property.appendChild(descriptionNode)
descriptionText = doc.createTextNode(description.decode('ISO-8859-1'))
descriptionNode.appendChild(descriptionText)
file = codecs.open('contentFinal.xml', 'w', encoding='ISO-8859-1')
file.write(doc.toprettyxml())
file.close()
The description node contains some characters in ISO-8859-1 encoding, this is encoding specified by the site it self in meta tag. But when doc.toprettyxml() starts writing in file I got following error:
Traceback (most recent call last):
File "main.py", line 467, in <module>
file.write(doc.toprettyxml())
File "C:\Python27\lib\xml\dom\minidom.py", line 60, in toprettyxml
return writer.getvalue()
File "C:\Python27\lib\StringIO.py", line 271, in getvalue
self.buf += ''.join(self.buflist)
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe1 in position 10: ordinal not in range(128)
Why am I getting this error as I am decoding and encoding with same standard?
Edited
I have following deceleration in my script file:
#!/usr/bin/python
# -*- coding: utf-8 -*-
may be this is conflicting?
Ok i have found a solution. When ever data is in other foriegn language you just need to defined the proper encoding in xml header. You do not need to describe encoding in file.write(doc.toprettyxml(encoding='ISO-8859-1')) not even when you are opening a file for writing file = codecs.open('contentFinal.xml', 'w', encoding='ISO-8859-1'). Below is the technique which i used. May be This is not a professional method but that works for me.
file = codecs.open('abc.xml', 'w')
xm = doc.toprettyxml()
xm = xm.replace('<?xml version="1.0" ?>', '<?xml version="1.0" encoding="ISO-8859-1"?>')
file.write(xm)
file.close()
May be there is a method to set default encoding in header but i could not find it.
Above method does not bring any error on browser and all data display perfectly.

Categories