Python script to scan a PDF file using an online scanner

I used the following script to scan multiple PDF files contained in a folder with the online scanner https://wepawet.iseclab.org/:
import mechanize
import re
import os

def upload_file(uploaded_file):
    url = "https://wepawet.iseclab.org/"
    br = mechanize.Browser()
    br.set_handle_robots(False)  # ignore robots
    br.open(url)
    br.select_form(nr=0)
    f = os.path.join("200", uploaded_file)
    br.form.add_file(open(f), 'text/plain', f)
    br.form.set_all_readonly(False)
    res = br.submit()
    content = res.read()
    with open("200_clean.html", "a") as f:
        f.write(content)

def main():
    for file in os.listdir("200"):
        upload_file(file)

if __name__ == '__main__':
    main()
But after running the code I got the following error:
Traceback (most recent call last):
File "test.py", line 56, in <module>
main()
File "test.py", line 50, in main
upload_file(file)
File "test.py", line 40, in upload_file
res = br.submit()
File "/home/suleiman/Desktop/mechanize/_mechanize.py", line 541, in submit
return self.open(self.click(*args, **kwds))
File "/home/suleiman/Desktop/mechanize/_mechanize.py", line 203, in open
return self._mech_open(url, data, timeout=timeout)
File "/home/suleiman/Desktop/mechanize/_mechanize.py", line 255, in _mech_open
raise response
mechanize._response.httperror_seek_wrapper: HTTP Error refresh: The HTTP server returned a redirect error that would lead to an infinite loop.
The last 30x error message was:
OK
Could anyone help me with this problem?

I think the issue is the MIME type text/plain you set. For PDF, this should be application/pdf. Your code worked for me with that change when I uploaded a sample PDF.
Change the br.form.add_file call to look like this:
br.form.add_file(open(f), 'application/pdf', f)
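For reference, here is a minimal sketch of the corrected upload function under the same folder layout as the question; opening the PDF in binary mode is an extra precaution on my part, not part of the original fix:
import os
import mechanize

def upload_file(uploaded_file):
    url = "https://wepawet.iseclab.org/"
    br = mechanize.Browser()
    br.set_handle_robots(False)  # ignore robots.txt
    br.open(url)
    br.select_form(nr=0)
    path = os.path.join("200", uploaded_file)
    # Binary mode plus the correct MIME type for a PDF upload.
    br.form.add_file(open(path, "rb"), 'application/pdf', path)
    br.form.set_all_readonly(False)
    res = br.submit()
    with open("200_clean.html", "a") as out:
        out.write(res.read())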

Related

xml.parsers.expat.ExpatError: not well-formed (invalid token) python3?

I have this code in Python 3 for E2 (Dreambox):
from xml.dom import Node, minidom
from urllib.request import urlopen, Request

selectedserverurl = 'http://fairbird.liveblog365.com/TSpanel/TSipanel.xml'

def downloadxmlpage():
    req = Request(selectedserverurl)
    response = urlopen(req)
    data = response.read()
    response.close()
    print("data:", data)
    gotPageLoad(data)
    print("gotPageLoad(data):", gotPageLoad(data))

def gotPageLoad(data=None):
    if data != None:
        xmlparse = minidom.parseString(data)
        for plugins in xmlparse.getElementsByTagName('plugins'):
            item = plugins.getAttribute('cont')
            if 'TSpanel' in item:
                for plugin in plugins.getElementsByTagName('plugin'):
                    tsitem = plugin.getAttribute('name')
                    print("tsitem:", tsitem)

downloadxmlpage()
I tried to read this file and extract the content from it:
http://fairbird.liveblog365.com/TSpanel/TSipanel.xml
But I got this error:
data: b'<html><body><script type="text/javascript" src="/aes.js" ></script><script>function toNumbers(d){var e=[];d.replace(/(..)/g,function(d){e.push(parseInt(d,16))});return e}function toHex(){for(var d=[],d=1==arguments.length&&arguments[0].constructor==Array?arguments[0]:arguments,e="",f=0;f<d.length;f++)e+=(16>d[f]?"0":"")+d[f].toString(16);return e.toLowerCase()}var a=toNumbers("f655ba9d09a112d4968c63579db590b4"),b=toNumbers("98344c2eee86c3994890592585b49f80"),c=toNumbers("55cc7e99e3f798b6063f25e8b0f8aa76");document.cookie="__test="+toHex(slowAES.decrypt(c,2,a,b))+"; expires=Thu, 31-Dec-37 23:55:55 GMT; path=/"; location.href="http://fairbird.liveblog365.com/TSpanel/TSipanel.xml?i=1";</script><noscript>This site requires Javascript to work, please enable Javascript in your browser or use a browser with Javascript support</noscript></body></html>'
Traceback (most recent call last):
File "/home/raed/Desktop/test.py", line 24, in <module>
downloadxmlpage()
File "/home/raed/Desktop/test.py", line 11, in downloadxmlpage
gotPageLoad(data)
File "/home/raed/Desktop/test.py", line 16, in gotPageLoad
xmlparse = minidom.parseString(data)
File "/usr/lib/python3.10/xml/dom/minidom.py", line 2000, in parseString
return expatbuilder.parseString(string)
File "/usr/lib/python3.10/xml/dom/expatbuilder.py", line 925, in parseString
return builder.parseString(string)
File "/usr/lib/python3.10/xml/dom/expatbuilder.py", line 223, in parseString
parser.Parse(string, True)
xml.parsers.expat.ExpatError: not well-formed (invalid token): line 1, column 222
How can I solve this issue?
Your data output is HTML, not XML, which is why the parser fails.
The HTML page redirects to http://fairbird.liveblog365.com/TSpanel/TSipanel.xml?i=1 using JavaScript, as the message says: "This site requires Javascript to work".
This is typically done to prevent scripts from scraping the page or server files.
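If you want the script to fail with a clear message instead of an ExpatError, here is a minimal sketch of a guard (an illustration only, not a way around the JavaScript check):
from xml.dom import minidom

def gotPageLoad(data=None):
    if not data:
        return None
    # The challenge page is HTML, so refuse it explicitly rather than letting
    # minidom fail with a confusing "not well-formed" error.
    if data.lstrip().lower().startswith(b"<html"):
        raise RuntimeError("Got an HTML challenge page instead of XML; "
                           "this URL needs a JavaScript-capable client.")
    return minidom.parseString(data)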

Downloading an encrypted/compressed 7z file from a Slack URL

I'm having an issue trying to download an encrypted/compressed 7-Zip file. I see the binary data being returned, but I still get an error that the file is not downloaded. The file is fetched from the Slack API via the 'url_private' property. The authentication is good and I see the byte data in my logger, but I'm still having issues.
import requests
import py7zr

url = "https://someurl/test_file.7z"
headers = {"auth": "token"}
response = requests.request("GET", url=file_url, headers=headers)
file_name = file_url.split("/")[-1]
if response.status_code == 200:
    with py7zr.SevenZipFile(file_name, mode='r', password=pw) as file:
        file.extractall(f"templates/files/")
        # logger.debug(file_data)
        file.close()
Error:
Failed to run listener function (error: [Errno 2] No such file or directory: 'test_file.7z')
Traceback (most recent call last):
File "/opt/venv/lib/python3.8/site-packages/slack_bolt/listener/thread_runner.py", line 103, in run_ack_function_asynchronously
listener.run_ack_function(request=request, response=response)
File "/opt/venv/lib/python3.8/site-packages/slack_bolt/listener/custom_listener.py", line 49, in run_ack_function
return self.ack_function(
File "/app/app/routes/events.py", line 62, in handle_file_shared
file = get_file_shared(file_url)
File "/app/app/lib/file_shared.py", line 28, in get_file_shared
with py7zr.SevenZipFile(file_name, mode='r', password=pw) as file:
File "/opt/venv/lib/python3.8/site-packages/py7zr/py7zr.py", line 324, in __init__
self.fp = open(file, "rb")
FileNotFoundError: [Errno 2] No such file or directory: 'test_file.7z'
I can't seem to find a solution, any help is appreciated.
You had most of it right. The main problem is that the downloaded bytes are never written to disk: py7zr.SevenZipFile(file_name, ...) tries to open test_file.7z as a local file, which doesn't exist yet. Save response.content to the file first, then open that file with py7zr:
import os
import requests
import py7zr

URL = "https://dl-alt1.winworldpc.com/Microsoft%20Windows%201.01%20Demo%20Slideshow%20(5.25).7z"
filename = os.path.basename(URL)
response = requests.get(URL, stream=True)
if response.status_code == 200:
    with open(filename, 'wb') as out:
        out.write(response.content)
    with py7zr.SevenZipFile(filename, 'r') as archive:
        archive.extractall(f"templates/files/")
else:
    print('Request failed: %d' % response.status_code)
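If the file really does come from the Slack url_private link mentioned in the question, the download usually needs an Authorization bearer header rather than a bare "auth" key. Here is a sketch under that assumption, with the token, URL, and password as placeholders:
import requests
import py7zr

SLACK_TOKEN = "xoxb-your-token"  # placeholder
file_url = "https://files.slack.com/files-pri/T000-F000/test_file.7z"  # placeholder
password = None  # set this if the archive is password-protected

headers = {"Authorization": "Bearer " + SLACK_TOKEN}
response = requests.get(file_url, headers=headers)
file_name = file_url.split("/")[-1]
if response.status_code == 200:
    # Write the bytes to disk first; py7zr opens a path, not a response object.
    with open(file_name, "wb") as out:
        out.write(response.content)
    with py7zr.SevenZipFile(file_name, mode="r", password=password) as archive:
        archive.extractall("templates/files/")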

How to handle the "Traceback" error in python?

Can anyone please tell me how to get rid of this Traceback (most recent call last): error in Python? I am using Python 2.7.9.
Take a look at the code.
import requests
import optparse

parser = optparse.OptionParser()
parser.add_option("-f", '--filename', action="store", dest="filee")
options, args = parser.parse_args()
file = options.filee
fopen = open(file, 'r')

for x in fopen.readlines():
    print "Checking for Clickjacking vulnerability\n"
    url = x.strip('\n')
    req = requests.get(url)
    try:
        print "[-]Target:" + url + " Not vulnerable\n The targeted victim has %s header\n" % (req.headers['X-Frame-Options'])
    except Exception as e:
        print "[+] Target:" + url + " Vulnerable to clickjacking"
After running the code I got this error at the end:
Traceback (most recent call last):
File "C:\Python27\utkarsh3.py", line 17, in <module>
req = requests.get(url)
File "C:\Python27\lib\site-packages\requests\api.py", line 72, in get
return request('get', url, params=params, **kwargs)
File "C:\Python27\lib\site-packages\requests\api.py", line 58, in request
return session.request(method=method, url=url, **kwargs)
File "C:\Python27\lib\site-packages\requests\sessions.py", line 494, in request
prep = self.prepare_request(req)
File "C:\Python27\lib\site-packages\requests\sessions.py", line 437, in prepare_request
hooks=merge_hooks(request.hooks, self.hooks),
File "C:\Python27\lib\site-packages\requests\models.py", line 305, in prepare
self.prepare_url(url, params)
File "C:\Python27\lib\site-packages\requests\models.py", line 379, in prepare_url
raise MissingSchema(error)
requests.exceptions.MissingSchema: Invalid URL '': No schema supplied. Perhaps you meant http://?
This really irritates me. I know many people have asked about this before, but I couldn't understand the answers, so I'm asking here.
Also, how should we beginners handle these errors?
In ELI5 fashion: a traceback is a log of what the program was trying to do before the actual error happened. Your actual error is requests.exceptions.MissingSchema.
The message that follows, Invalid URL '': No schema supplied. Perhaps you meant http://?, describes the exact problem.
File "C:\Python27\utkarsh3.py", line 17, in <module>
req = requests.get(url)
The lines above describe where the error started.
So, if you go to line 17 of your program, you should see this exact line.
Putting these two things together, it looks like url is a string such as example.com rather than http://example.com; given the '' in the message, it may even be an empty line from your input file.
I can only speculate about the rest of your code, but feel free to provide more snippets to explain further.
I hope this helps you read future tracebacks.
Edit 1: Now that you've added the snippet, try printing url just before requests.get(url) to see exactly what you are trying to reach, and whether it has the right schema prepended.
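For example, a small guard in the loop (a sketch in the question's Python 2 style, reusing the fopen file of URLs from the question) that skips blank lines and prepends a schema when it is missing:
for x in fopen.readlines():
    url = x.strip()
    if not url:
        continue  # blank lines are what produce "Invalid URL ''"
    if not url.startswith(("http://", "https://")):
        url = "http://" + url  # supply the missing schema
    print "Checking %s" % url
    req = requests.get(url)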

Python web crawler : Connection Timed out

I am trying to implement a simple web crawler and have already written some simple code to start off. There are two modules, fetcher.py and crawler.py. Here are the files:
fetcher.py:
import urllib2
import re

def fetcher(s):
    "fetch a web page from a url"
    try:
        req = urllib2.Request(s)
        urlResponse = urllib2.urlopen(req).read()
    except urllib2.URLError as e:
        print e.reason
        return
    p, q = s.split("//")
    d = q.split("/")
    fdes = open(d[0], "w+")
    fdes.write(str(urlResponse))
    fdes.seek(0)
    return fdes

if __name__ == "__main__":
    defaultSeed = "http://www.python.org"
    print fetcher(defaultSeed)
crawler.py:
from bs4 import BeautifulSoup
import re
from fetchpage import fetcher

usedLinks = open("Used", "a+")
newLinks = open("New", "w+")
newLinks.seek(0)

def parse(fd, var=0):
    soup = BeautifulSoup(fd)
    for li in soup.find_all("a", href=re.compile("http")):
        newLinks.seek(0, 2)
        newLinks.write(str(li.get("href")).strip("/"))
        newLinks.write("\n")
    fd.close()
    newLinks.seek(var)
    link = newLinks.readline().strip("\n")
    return str(link)

def crawler(seed, n):
    if n == 0:
        usedLinks.close()
        newLinks.close()
        return
    else:
        usedLinks.write(seed)
        usedLinks.write("\n")
        fdes = fetcher(seed)
        newSeed = parse(fdes, newLinks.tell())
        crawler(newSeed, n - 1)

if __name__ == "__main__":
    crawler("http://www.python.org/", 7)
The problem is that when I run crawler.py it works fine for the first 4-5 links, then hangs, and after a minute gives me the following error:
[Errno 110] Connection timed out
Traceback (most recent call last):
File "crawler.py", line 37, in <module>
crawler("http://www.python.org/",7)
File "crawler.py", line 34, in crawler
crawler(newSeed,n-1)
File "crawler.py", line 34, in crawler
crawler(newSeed,n-1)
File "crawler.py", line 34, in crawler
crawler(newSeed,n-1)
File "crawler.py", line 34, in crawler
crawler(newSeed,n-1)
File "crawler.py", line 34, in crawler
crawler(newSeed,n-1)
File "crawler.py", line 33, in crawler
newSeed = parse(fdes,newLinks.tell())
File "crawler.py", line 11, in parse
soup = BeautifulSoup(fd)
File "/usr/lib/python2.7/dist-packages/bs4/__init__.py", line 169, in __init__
self.builder.prepare_markup(markup, from_encoding))
File "/usr/lib/python2.7/dist-packages/bs4/builder/_lxml.py", line 68, in prepare_markup
dammit = UnicodeDammit(markup, try_encodings, is_html=True)
File "/usr/lib/python2.7/dist-packages/bs4/dammit.py", line 191, in __init__
self._detectEncoding(markup, is_html)
File "/usr/lib/python2.7/dist-packages/bs4/dammit.py", line 362, in _detectEncoding
xml_encoding_match = xml_encoding_re.match(xml_data)
TypeError: expected string or buffer
Can anyone help me with this? I am very new to Python and unable to figure out why it says the connection timed out after some time.
A connection timeout is not specific to Python; it just means that you made a request to the server and the server did not respond within the amount of time your application was willing to wait.
One very possible reason this could occur is that python.org may have some mechanism to detect multiple requests coming from a script, and it simply stops serving pages after 4-5 requests. There is nothing you can really do to avoid this other than trying your script on a different site.
You could try using proxies to avoid being detected when making multiple requests, as stated above. You might want to check out this answer to get an idea of how to send urllib requests through a proxy: How to open website with urllib via Proxy - Python
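For illustration, a minimal sketch of routing urllib2 requests through a proxy with an explicit timeout (Python 2 to match the question; the proxy address is a hypothetical placeholder):
import urllib2

proxy = urllib2.ProxyHandler({"http": "http://127.0.0.1:8080"})  # placeholder proxy
opener = urllib2.build_opener(proxy)
urllib2.install_opener(opener)
try:
    page = urllib2.urlopen("http://www.python.org/", timeout=10).read()
except urllib2.URLError as e:
    print "request failed:", e.reason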

Python: saving large web page to file

Let me start off by saying I'm not new to programming, but I am very new to Python.
I've written a program using urllib2 that requests a web page that I would then like to save to a file. The web page is about 300KB, which doesn't strike me as particularly large but seems to be enough to give me trouble, so I'm calling it 'large'.
I'm using a simple call to copy directly from the object returned from urlopen into the file:
file.write(webpage.read())
but it just sits for minutes trying to write to the file, and I eventually receive the following:
Traceback (most recent call last):
File "program.py", line 51, in <module>
main()
File "program.py", line 43, in main
f.write(webpage.read())
File "/usr/lib/python2.7/socket.py", line 351, in read
data = self._sock.recv(rbufsize)
File "/usr/lib/python2.7/httplib.py", line 541, in read
return self._read_chunked(amt)
File "/usr/lib/python2.7/httplib.py", line 592, in _read_chunked
value.append(self._safe_read(amt))
File "/usr/lib/python2.7/httplib.py", line 649, in _safe_read
raise IncompleteRead(''.join(s), amt)
httplib.IncompleteRead: IncompleteRead(6384 bytes read, 1808 more expected)
I don't know why this gives the program so much grief.
EDIT:
Here is how I'm retrieving the page:
jar = cookielib.CookieJar()
cookie_processor = urllib2.HTTPCookieProcessor(jar)
opener = urllib2.build_opener(cookie_processor)
urllib2.install_opener(opener)

requ_login = urllib2.Request(LOGIN_PAGE,
                             data=urllib.urlencode({'destination': "",
                                                    'username': USERNAME,
                                                    'password': PASSWORD}))
requ_page = urllib2.Request(WEBPAGE)
try:
    # login
    urllib2.urlopen(requ_login)
    # get desired page
    portfolio = urllib2.urlopen(requ_page)
except urllib2.URLError as e:
    print e.code, ": ", e.reason
I'd use the handy file-object copier function provided by the shutil module. It worked on my machine :)
>>> import urllib2
>>> import shutil
>>> remote_fo = urllib2.urlopen('http://docs.python.org/library/shutil.html')
>>> with open('bigfile', 'wb') as local_fo:
... shutil.copyfileobj(remote_fo, local_fo)
...
>>>
UPDATE: You may want to pass a third argument to copyfileobj, which controls the size of the internal buffer used to transfer bytes.
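For example, with a hypothetical 64 KiB buffer, continuing the session above:
>>> with open('bigfile', 'wb') as local_fo:
...     shutil.copyfileobj(remote_fo, local_fo, 64 * 1024)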
UPDATE2: There's nothing fancy about shutil.copyfileobj. It simply reads a chunk of bytes from the source file object and writes it to the destination file object, repeatedly, until there's nothing more to read. Here's the actual source code, grabbed from the Python standard library:
def copyfileobj(fsrc, fdst, length=16*1024):
    """copy data from file-like object fsrc to file-like object fdst"""
    while 1:
        buf = fsrc.read(length)
        if not buf:
            break
        fdst.write(buf)
