Python - unknown url type error

from tkinter import *
import tkinter as tk
import pyodbc
import urllib.request
from bs4 import BeautifulSoup, Comment

link = ""

def scraper(urls):
    with urllib.request.urlopen(urls) as url:
        content = url.read()
    soup = BeautifulSoup(content, "html.parser")
    rows = soup.find_all('div', attrs={"class": "reviewText"})
    for row in soup.find_all('div', attrs={"class": "reviewText"}):
        print(row.text)

root1 = tk.Tk()
label1 = tk.Label(root1, text='product A')
input1 = StringVar()
entry1 = tk.Entry(root1, textvariable=input1)
label1.pack(side=tk.TOP)
entry1.pack()
buttonstr = tk.StringVar()

db = r"C:\Users\Goutham\Documents\keshav\testdb.accdb"
print("connecting db..")

def odbc():
    '''
    connects with odbc
    '''
    global link
    constr = 'Driver={Microsoft Access Driver (*.mdb, *.accdb)};Dbq=' + db
    conn = pyodbc.connect(constr, autocommit=True)
    cur = conn.cursor()
    check = input1.get()
    print("fetching from access.....")
    strsql = "select Url from student where PdtName='%s' " % (check,)
    cur.execute(strsql)
    results = cur.fetchall()
    link = check
    print(results, check)
    conn.close()

buttonA = tk.Button(text="hello", command=odbc)
buttonA.pack()
scraper(link)
I need this code to read the input, store it in the variable check, and compare it with the values in the database using a SQL query. The matching row is used to retrieve the URL from the database, and that URL is passed as a parameter to the function scraper(), which prints the extracted text.
The following error is displayed:
Traceback (most recent call last):
  File "C:\Python33\module1.py", line 62, in <module>
    scraper(link)
  File "C:\Python33\module1.py", line 13, in scraper
    with urllib.request.urlopen(urls) as url:
  File "C:\Python33\lib\urllib\request.py", line 156, in urlopen
    return opener.open(url, data, timeout)
  File "C:\Python33\lib\urllib\request.py", line 454, in open
    req = Request(fullurl, data)
  File "C:\Python33\lib\urllib\request.py", line 275, in __init__
    self._parse()
  File "C:\Python33\lib\urllib\request.py", line 280, in _parse
    raise ValueError("unknown url type: %r" % self.full_url)
ValueError: unknown url type: ''
Please help.
Thank you.

You are calling scraper(link) at the end of your script, and at that point link is still the empty string. That's why you get ValueError: unknown url type: ''.
Remove that statement, validate the URL you fetch from the database inside your odbc callback, and call scraper() from there once you actually have a URL.
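A minimal sketch of that approach, reusing the table and column names from your code (student, PdtName, Url); the parameterised query, fetchone(), and the scheme check are suggestions of mine rather than part of your original code:

def odbc():
    '''Look up the URL for the entered product, then scrape it.'''
    constr = 'Driver={Microsoft Access Driver (*.mdb, *.accdb)};Dbq=' + db
    conn = pyodbc.connect(constr, autocommit=True)
    cur = conn.cursor()
    check = input1.get()
    # Parameterised query avoids quoting problems with the product name
    cur.execute("select Url from student where PdtName = ?", (check,))
    row = cur.fetchone()
    conn.close()
    if row and row[0] and row[0].startswith(("http://", "https://")):
        scraper(row[0])   # pass the fetched URL, not the product name
    else:
        print("No valid URL found for", check)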

Related

'ValueError: unknown url type in' tkinter and urllib

Looks like I am missing something very critical. I am getting this error even before the GUI window pops up or I click the button.
When I enter data in the Entry, it is supposed to be picked up and passed into 'url_link', which is then passed to 'get_data_url'. The 'get_data_url' function is supposed to be executed AFTER the button is pressed, but it's being executed right at the beginning. I am not sure what's wrong here.
Traceback (most recent call last):
  File "gui.py", line 100, in <module>
    btn1 = Button(win, text="Submit", command = get_data_url(url_link))
  File "gui.py", line 50, in get_data_url
    req = Request(url_link, headers={'User-Agent': 'Mozilla/5.0'})
  File "/usr/lib/python3.8/urllib/request.py", line 328, in __init__
    self.full_url = url
  File "/usr/lib/python3.8/urllib/request.py", line 354, in full_url
    self._parse()
  File "/usr/lib/python3.8/urllib/request.py", line 383, in _parse
    raise ValueError("unknown url type: %r" % self.full_url)
ValueError: unknown url type: '/wp-json/wp/v2/posts/?per_page=100'
My code -
##GUI
import tkinter as tk
from tkinter import messagebox
from tkinter import *

win = tk.Tk()
win.geometry("300x200")

#Label
label = Label(text="URL - ")
label.place(x=20, y=50)

#Entry
entry1 = tk.Entry()
entry1.place(x=70, y=50)

#Execution
##MainCode
import os
import csv
import json
import sys
import requests
import urllib
from urllib.request import Request, urlopen, HTTPError
from urllib.parse import urlparse
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('-f', '--file', help='To mention file')
parser.add_argument('-u', '--url', help='Passing one url')
parser.add_argument('-p', '--pages', action='store_true', help='To download pages/post')
args = parser.parse_args()

def get_urls(filename):
    urls = []
    file = open(filename, "r")
    for i in file:
        i = i.replace("\n", "")
        urls.append(i)
    return urls

def get_data_url(url_link):
    req = Request(url_link, headers={'User-Agent': 'Mozilla/5.0'})
    webpage = urlopen(req).read()

    ## Fetching hostname of the URL
    parsed_uri = urlparse(url_link)
    result = '{uri.netloc}'.format(uri=parsed_uri)
    print(result)

    # Write data to file
    filename = "data/" + result + "-raw.txt"
    file_ = open(filename, 'wb')
    file_.write(webpage)
    file_.close()

    with open(filename) as json_file:
        json_data = json.load(json_file)

    C_data = []
    for n in json_data:
        r = {}
        r["Modified"] = n['modified']
        r["Title"] = n['title']['rendered']
        r["Content"] = n['content']['rendered']
        r["Link"] = n['link']

        # JSON Conversion
        j_data = {
            "modified/posted": r["Modified"],
            "title": r["Title"],
            "content": r["Content"],
            "link": r["Link"]
        }
        C_data.append(j_data)
        print("Title: " + r["Title"])
        print("Status: Downloaded")

    json_object = json.dumps(C_data, indent=4)

    # Writing to sample.json
    with open("data/" + result + "-data.json", "w") as outfile:
        outfile.write(json_object)
    print("Extracted Successfully")

urlhere = entry1.get()
url_link = urlhere + "/wp-json/wp/v2/posts/?per_page=100"

#Button
btn1 = Button(win, text="Submit", command = get_data_url(url_link))
btn1.place(x=90, y=80)

win.mainloop()
The error comes from event-driven programming: you are assigning urlhere at the moment the script runs, so it is empty (the Entry box is empty at start-up). To fix it, move that code inside the function, like:
# Same code
def get_data_url():
    urlhere = entry1.get()
    url_link = urlhere + "/wp-json/wp/v2/posts/?per_page=100"
    req = Request(url_link, headers={'User-Agent': 'Mozilla/5.0'})
    webpage = urlopen(req).read()
    ## Fetching hostname of the URL
    .... # Same code

btn1 = Button(win, text="Submit", command=get_data_url)
You can get rid of the parameter as you don't have to use it anymore.
You should get the content of the Entry inside the Submit button's callback, construct the URL there, and call get_data_url():
def submit():
    urlhere = entry1.get()
    url_link = urlhere + "/wp-json/wp/v2/posts/?per_page=100"
    get_data_url(url_link)

btn1 = Button(win, text="Submit", command=submit)
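Since the traceback complains about a URL with no scheme, it may also be worth validating the Entry content inside submit() before building the request. A small sketch; the warning dialog and the https:// default are assumptions, not part of the original code:

def submit():
    urlhere = entry1.get().strip()
    if not urlhere:
        messagebox.showwarning("Missing URL", "Please enter a site URL first.")
        return
    # Assume https:// when the user leaves the scheme out
    if not urlhere.startswith(("http://", "https://")):
        urlhere = "https://" + urlhere
    get_data_url(urlhere + "/wp-json/wp/v2/posts/?per_page=100")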

Fetch Error using imaplib

I am writing a script to go through my email and calculate how much money I have spent in total on Uber rides. (Uber sends a receipt to your email that includes the cost; I am going through the emails to find each cost and appending it to an array.) I had it working in tests with a single email, but am running into an issue while trying to loop through all of the emails.
I know that id_list (the list of email ids) is populated. When I print it out, I get: ['4726', '5543', '5587', '5589', '5661', '5758', '5759', '5853', '5986', '6071', '6072', '6076', '6105', '6141', '6229']
Here is my full error traceback:
Traceback (most recent call last):
  File "/Users/Harrison/Desktop/Uber/Uber.py", line 22, in <module>
    result,data = mail.fetch(id, "(RFC822")
  File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/imaplib.py", line 456, in fetch
    typ, dat = self._simple_command(name, message_set, message_parts)
  File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/imaplib.py", line 1088, in _simple_command
    return self._command_complete(name, self._command(name, *args))
  File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/imaplib.py", line 918, in _command_complete
    raise self.error('%s command error: %s %s' % (name, typ, data))
imaplib.error: FETCH command error: BAD ['Could not parse command']
And here is my code:
import imaplib
import email
from bs4 import BeautifulSoup

final_cost1 = ""
final_cost2 = ""
cost_array = []

mail = imaplib.IMAP4_SSL('imap.gmail.com')
mail.login('my email', 'my passowrd')
mail.list()
mail.select('inbox')

result,data = mail.search(None, 'FROM', '"Uber Receipts"')
ids = data[0]
id_list = ids.split()

for id in id_list:
    result,data = mail.fetch(id, "(RFC822")
    message_body = data[0][1]
    uber_email = email.message_from_string(message_body)
    for part in uber_email.walk():
        if part.get_content_type() == "text/html":
            body = part.get_payload(None, decode=True)
            soup = BeautifulSoup(body, 'html.parser')
            #print soup.prettify()
            for row in soup.find_all('td', attrs={"class" : "price final-charge"}):
                final_cost1 = row.text.lstrip().strip()
            for row in soup.find_all('td', attrs={"class" : "totalPrice chargedFare black"}):
                final_cost2 = row.text.lstrip().strip()
    if final_cost1 != "":
        print final_cost1
        cost_array.append(final_cost1)
    if final_cost2 != "":
        print final_cost2
        cost_array.append(final_cost2)

print cost_array
Silly mistake on my part.
This line result,data = mail.fetch(id, "(RFC822") had a typo. It should have been result,data = mail.fetch(id, "(RFC822)")
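With that fixed, a hedged sketch of the final totalling step the question is aiming for; it assumes each scraped cost looks like '$12.34', which may need adjusting to the actual receipt format:

total = 0.0
for cost in cost_array:
    # Strip the currency symbol and thousands separators before converting
    total += float(cost.replace('$', '').replace(',', ''))
print 'Total spent on Uber: $%.2f' % total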

Issue downloading csv file from website in Python

So I am trying to download a csv file from a site and write it to my computer; the site requires my email address and password for authentication. I have the following code:
import cStringIO
import pycurl
import urllib

url = 'http://www.riglocator.ca/report=rig%2Frig%2D150226%2Ecsv'

def GetPage(url, proxy=None):
    if proxy:
        port = 8888
        proxy = proxy.replace("socks://", "")
        if ":" in proxy:
            port = int(proxy.rsplit(":", 1)[1])
            proxy = proxy.rsplit(":", 1)[0]
    try:
        buf = cStringIO.StringIO()
        c = pycurl.Curl()
        c.setopt(c.URL, url)
        c.setopt(c.WRITEFUNCTION, buf.write)
        c.setopt(c.CONNECTTIMEOUT, 5)
        c.setopt(c.TIMEOUT, 8)
        if proxy:
            c.setopt(pycurl.PROXY, proxy)
            c.setopt(pycurl.PROXYPORT, port)
            c.setopt(pycurl.PROXYTYPE, pycurl.PROXYTYPE_SOCKS5)
        c.setopt(pycurl.USERPWD, 'john#mail.com:password123')
        c.setopt(c.FOLLOWLOCATION, True)
        c.perform()
        c.close()
        results = buf.getvalue()
        buf.close()
    except:
        results = ""
    return results

GetPage(url, "socks://127.0.0.1:8888")

def loader():
    csv_url = GetPage(url, "socks://127.0.0.1:8888")
    r = urllib.urlopen(csv_url)
    print(r)
    csv = r.read()
    csv_str = str(csv)
    lines = csv_str.split('\\n')
    dest_url = r'mapfile.csv'
    fx = open(dest_url, 'w')
    for line in lines:
        fx.write(line + '\n')
    fx.close()

loader()
But this still returns the HTML code from the login page. Any suggestions?
I am getting this error:
File "C:/Users/cevans/PycharmProjects/RigLocatorMapPull/rigmapscrape.py", line 55, in <module>
loader()
File "C:/Users/cevans/PycharmProjects/RigLocatorMapPull/rigmapscrape.py", line 44, in loader
r = urllib.urlopen(csv_url)
File "C:\Python27\lib\urllib.py", line 87, in urlopen
return opener.open(url)
File "C:\Python27\lib\urllib.py", line 208, in open
return getattr(self, name)(url)
File "C:\Python27\lib\urllib.py", line 463, in open_file
return self.open_local_file(url)
File "C:\Python27\lib\urllib.py", line 477, in open_local_file
raise IOError(e.errno, e.strerror, e.filename)
IOError: [Errno 2] The system cannot find the path specified: ''
Process finished with exit code 1
Here is a link to some code I wrote to grab a file with pycurl; it should do basically what you need. You just need to add the option c.setopt(pycurl.USERPWD, 'username:userpass') to my code to set your username and password.
http://prestongarrison.com/proper-python-pycurl-example/
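That suggestion condensed into a minimal sketch (HTTP basic auth only; if the site actually logs you in through an HTML form, the Mechanize answer below is the better fit):

import cStringIO
import pycurl

def fetch(url, userpwd):
    # Download url with basic auth and return the response body as a string
    buf = cStringIO.StringIO()
    c = pycurl.Curl()
    c.setopt(pycurl.URL, url)
    c.setopt(pycurl.WRITEFUNCTION, buf.write)   # collect the body in memory
    c.setopt(pycurl.USERPWD, userpwd)           # e.g. 'email:password'
    c.setopt(pycurl.FOLLOWLOCATION, True)
    c.perform()
    c.close()
    return buf.getvalue()

#csv_data = fetch(url, 'email:password')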
#This is a solution using the Mechanize browser library which takes the url,
#changes it to the current date, submits the username/password in a form,
#downloads a csv and writes it to a folder location:
__author__ = 'cevans'

import mechanize
import os
import cookielib
import datetime, string

USERNAME = 'xxxx'
PASSWORD = 'xxxxx'
OLDURL = 'http://www.oldurl.com/report050301'
folder = r'\\Driver'

def loader():
    #Takes current date and changes URL to grab correct datefile (Schedule only runs on day of week)
    cdate = str(datetime.date.today().strftime("%y%m%d"))
    DATAURL = string.replace(OLDURL, '150301', cdate)

    # Browser and Cookie Jar
    br = mechanize.Browser()
    cj = cookielib.LWPCookieJar()
    br.set_cookiejar(cj)

    # Browser options
    br.set_handle_equiv(True)
    br.set_handle_gzip(False)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(True)
    # Follows refresh 0 but not hangs on refresh > 0
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

    # Opens site:
    r = br.open(DATAURL)
    html = r.read()
    br.select_form(nr=0)
    br.form['nauthemail'] = USERNAME
    br.form['password'] = PASSWORD
    br.submit()
    r = br.open(DATAURL)

    #Read and write file to csv, in folder
    csv = r.read()
    csv_str = str(csv)
    lines = csv_str.split('\\n')
    fname = 'map-' + cdate
    base_filename = fname
    filename_suffix = '.csv'
    folder1 = os.path.join(folder, base_filename + filename_suffix)
    dest_url = folder1
    fx = open(dest_url, 'w')
    for line in lines:
        fx.write(line + '\n')
    fx.close()

loader()

How do I debug this error with Python?

The code I post below gives me this error, and I can't figure out why or how to fix it. If anyone could help I would greatly appreciate it. Thanks!
Traceback (most recent call last):
  File "C:\Users\Robert\Documents\j-a-c-o-b\newlc.py", line 99, in <module>
    main()
  File "C:\Users\Robert\Documents\j-a-c-o-b\newlc.py", line 76, in main
    for final_url in pool.imap(handle_listing, listings):
  File "C:\Python27\lib\site-packages\eventlet-0.9.16-py2.7.egg\eventlet\greenpool.py", line 232, in next
    val = self.waiters.get().wait()
  File "C:\Python27\lib\site-packages\eventlet-0.9.16-py2.7.egg\eventlet\greenthread.py", line 166, in wait
    return self._exit_event.wait()
  File "C:\Python27\lib\site-packages\eventlet-0.9.16-py2.7.egg\eventlet\event.py", line 120, in wait
    current.throw(*self._exc)
  File "C:\Python27\lib\site-packages\eventlet-0.9.16-py2.7.egg\eventlet\greenthread.py", line 192, in main
    result = function(*args, **kwargs)
  File "C:\Users\Robert\Documents\j-a-c-o-b\newlc.py", line 48, in handle_listing
    yellow_page = BeautifulSoup(download(yellow_page_url))
  File "build\bdist.win32\egg\BeautifulSoup.py", line 1519, in __init__
    BeautifulStoneSoup.__init__(self, *args, **kwargs)
  File "build\bdist.win32\egg\BeautifulSoup.py", line 1144, in __init__
    self._feed(isHTML=isHTML)
  File "build\bdist.win32\egg\BeautifulSoup.py", line 1168, in _feed
    smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
  File "build\bdist.win32\egg\BeautifulSoup.py", line 1770, in __init__
    self._detectEncoding(markup, isHTML)
  File "build\bdist.win32\egg\BeautifulSoup.py", line 1915, in _detectEncoding
    '^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
TypeError: expected string or buffer
I don't know what it wants or what it means...
This is my code:
from gzip import GzipFile
from cStringIO import StringIO
import re
import webbrowser
import time
from difflib import SequenceMatcher
import os
import sys
from BeautifulSoup import BeautifulSoup
import eventlet
from eventlet.green import urllib2
import urllib2
import urllib

def download(url):
    print "Downloading:", url
    s = urllib2.urlopen(url).read()
    if s[:2] == '\x1f\x8b':
        ifh = GzipFile(mode='rb', fileobj=StringIO(s))
        s = ifh.read()
    print "Downloaded: ", url
    return s

def replace_chars(text, replacements):
    return ''.join(replacements.get(x,x) for x in text)

def handle_listing(listing_url):
    listing_document = BeautifulSoup(download(listing_url))
    # ignore pages that link to yellowpages
    if not listing_document.find("a", href=re.compile(re.escape("http://www.yellowpages.com/") + ".*")):
        listing_title = listing_document.title.text
        # define an alphabet
        alfa = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        reps = {' ':'-', ',':'', '\'':'', '[':'', ']':'', '-Suite-' + alfa[1-26] : ''}
        if TITLE_MATCH.match(listing_title) is not None:
            title, = TITLE_MATCH.match(listing_title).groups()
            if ADDRESS_MATCH.match(listing_title) is not None:
                address, = ADDRESS_MATCH.match(listing_title).groups()
                yellow_page_url = "http://www.yellowpages.com/%s/%s?order=distance" % (
                    replace_chars(address, reps),
                    replace_chars(title, reps),
                )
                yellow_page = BeautifulSoup(download(yellow_page_url))
                page_url = yellow_page.find("h3", {"class" : "business-name fn org"})
                if page_url:
                    page_url = page_url.a["href"]
                business_name = title[:title.index(",")]
                page = BeautifulSoup(download(page_url))
                yellow_page_address = page.find("span", {"class" : "street-address"})
                if yellow_page_address:
                    if SequenceMatcher(None, address, yellow_page_address.text).ratio() >= 0.5:
                        pid, = re.search(r'p(\d{5,20})\.jsp', listing_url).groups(0)
                        page_escaped = replace_chars(page_url, {':':'%3A', '/':'%2F', '?':'%3F', '=':'%3D'})
                        final_url = "http://www.locationary.com/access/proxy.jsp?ACTION_TOKEN=proxy_jsp$JspView$SaveAction&inPlaceID=%s&xxx_c_1_f_987=%s" % (
                            pid, page_escaped)
                        return final_url

def main():
    pool = eventlet.GreenPool()
    listings_document = BeautifulSoup(download(START_URL))
    listings = listings_document.findAll("a", href = LOCATION_LISTING)
    listings = [listing['href'] for listing in listings]
    for final_url in pool.imap(handle_listing, listings):
        print final_url
        """
        if str(final_url) is not None:
            url = str(final_url)
            req = urllib2.Request(url)
            response = urllib2.urlopen(req)
            page = response.read()
            time.sleep(2)
        """

for a in range(0,1):
    START_URL = 'http://www.locationary.com/place/en/US/Arkansas/Fayetteville-page2/?ACTION_TOKEN=NumericAction'
    TITLE_MATCH = re.compile(r'(.*) \(\d{1,10}.{1,100}\)$')
    ADDRESS_MATCH = re.compile(r'.{1,100}\((.*), .{4,14}, United States\)$')
    LOCATION_LISTING = re.compile(r'http://www\.locationary\.com/place/en/US/.{1,50}/.{1,50}/.{1,100}\.jsp')

    if __name__ == '__main__':
        main()
A very common mistake made by novices using any language that supports exceptions is that they catch exceptions that they do not actually handle. This leads to hard-to-debug errors since it disrupts the normal flow of the program.
Specifically, catching urllib2.HTTPError in download() is preventing actual problems from being propagated to the rest of the program. Either remove the exception handler altogether, or raise at the end of the handler to maintain flow.
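For illustration, a sketch of the re-raise pattern described above; the try/except here is hypothetical, since the snippet you posted no longer shows the handler:

def download(url):
    print "Downloading:", url
    try:
        s = urllib2.urlopen(url).read()
    except urllib2.HTTPError:
        print "Failed to download:", url
        raise  # re-raise so the caller sees the real error instead of None
    if s[:2] == '\x1f\x8b':
        s = GzipFile(mode='rb', fileobj=StringIO(s)).read()
    print "Downloaded: ", url
    return s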

Why do I keep getting this title match error with my Python program?

When I run the following code, I keep getting this error:
Traceback (most recent call last):
  File "C:\Users\Robert\Documents\j-a-c-o-b\newlc.py", line 94, in <module>
    main()
  File "C:\Users\Robert\Documents\j-a-c-o-b\newlc.py", line 71, in main
    for final_url in pool.imap(handle_listing, listings):
  File "C:\Python27\lib\site-packages\eventlet-0.9.16-py2.7.egg\eventlet\greenpool.py", line 232, in next
    val = self.waiters.get().wait()
  File "C:\Python27\lib\site-packages\eventlet-0.9.16-py2.7.egg\eventlet\greenthread.py", line 166, in wait
    return self._exit_event.wait()
  File "C:\Python27\lib\site-packages\eventlet-0.9.16-py2.7.egg\eventlet\event.py", line 120, in wait
    current.throw(*self._exc)
  File "C:\Python27\lib\site-packages\eventlet-0.9.16-py2.7.egg\eventlet\greenthread.py", line 192, in main
    result = function(*args, **kwargs)
  File "C:\Users\Robert\Documents\j-a-c-o-b\newlc.py", line 35, in handle_listing
    title, = TITLE_MATCH.match(listing_title).groups()
AttributeError: 'NoneType' object has no attribute 'groups'
What is wrong?
It has something to do with the Title match but I don't know how to fix it!
If you could help me I would really appreciate it!
Thanks!
from gzip import GzipFile
from cStringIO import StringIO
import re
import webbrowser
import time
from difflib import SequenceMatcher
import os
import sys
from BeautifulSoup import BeautifulSoup
import eventlet
from eventlet.green import urllib2
import urllib2
import urllib

def download(url):
    print "Downloading:", url
    s = urllib2.urlopen(url).read()
    if s[:2] == '\x1f\x8b':
        ifh = GzipFile(mode='rb', fileobj=StringIO(s))
        s = ifh.read()
    print "Downloaded: ", url
    return s

def replace_chars(text, replacements):
    return ''.join(replacements.get(x,x) for x in text)

def handle_listing(listing_url):
    listing_document = BeautifulSoup(download(listing_url))
    # ignore pages that link to yellowpages
    if not listing_document.find("a", href=re.compile(re.escape("http://www.yellowpages.com/") + ".*")):
        listing_title = listing_document.title.text
        reps = {' ':'-', ',':'', '\'':'', '[':'', ']':''}
        title, = TITLE_MATCH.match(listing_title).groups()
        address, = ADDRESS_MATCH.match(listing_title).groups()
        yellow_page_url = "http://www.yellowpages.com/%s/%s?order=distance" % (
            replace_chars(address, reps),
            replace_chars(title, reps),
        )
        yellow_page = BeautifulSoup(download(yellow_page_url))
        page_url = yellow_page.find("h3", {"class" : "business-name fn org"})
        if page_url:
            page_url = page_url.a["href"]
        business_name = title[:title.index(",")]
        page = BeautifulSoup(download(page_url))
        yellow_page_address = page.find("span", {"class" : "street-address"})
        if yellow_page_address:
            if SequenceMatcher(None, address, yellow_page_address.text).ratio() >= 0.5:
                pid, = re.search(r'p(\d{5,20})\.jsp', listing_url).groups(0)
                page_escaped = replace_chars(page_url, {':':'%3A', '/':'%2F', '?':'%3F', '=':'%3D'})
                final_url = "http://www.locationary.com/access/proxy.jsp?ACTION_TOKEN=proxy_jsp$JspView$SaveAction&inPlaceID=%s&xxx_c_1_f_987=%s" % (
                    pid, page_escaped)
                return final_url

def main():
    pool = eventlet.GreenPool()
    listings_document = BeautifulSoup(download(START_URL))
    listings = listings_document.findAll("a", href = LOCATION_LISTING)
    listings = [listing['href'] for listing in listings]
    for final_url in pool.imap(handle_listing, listings):
        print final_url
        if str(final_url) is not None:
            url = str(final_url)
            req = urllib2.Request(url)
            response = urllib2.urlopen(req)
            page = response.read()
            time.sleep(2)

for a in range(2,3):
    START_URL = 'http://www.locationary.com/place/en/US/New_Jersey/Randolph-page' + str(a) + '/?ACTION_TOKEN=NumericAction'
    TITLE_MATCH = re.compile(r'(.*) \(\d{1,10}.{1,100}\)$')
    ADDRESS_MATCH = re.compile(r'.{1,100}\((.*), .{4,14}, United States\)$')
    LOCATION_LISTING = re.compile(r'http://www\.locationary\.com/place/en/US/.{1,50}/.{1,50}/.{1,100}\.jsp')

    if __name__ == '__main__':
        main()
Quoting from your error:
title, = TITLE_MATCH.match(listing_title).groups()
AttributeError: 'NoneType' object has no attribute 'groups'
TITLE_MATCH.match(listing_title) returns None, so you can't call .groups().
When a re .match does not find anything to match, it returns None. Since you cannot call .groups() on None, you have to check for a match first. To do that:
Change this:

title, = TITLE_MATCH.match(listing_title).groups()
address, = ADDRESS_MATCH.match(listing_title).groups()

To this:

titleMatch = TITLE_MATCH.match(listing_title)
if titleMatch:
    title, = titleMatch.groups()
else:
    pass  # handle it

addressMatch = ADDRESS_MATCH.match(listing_title)
if addressMatch:
    address, = addressMatch.groups()
else:
    pass  # handle it
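In handle_listing, "handle it" can be as simple as returning early so that listing is skipped; a minimal sketch of that choice, meant to slot into the existing function:

titleMatch = TITLE_MATCH.match(listing_title)
addressMatch = ADDRESS_MATCH.match(listing_title)
if not (titleMatch and addressMatch):
    # Page title doesn't look like a normal listing, so skip it
    return None
title, = titleMatch.groups()
address, = addressMatch.groups()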
