Issue downloading csv file from website in Python - python

So I am trying to download and write a csv file onto my computer from a site that requires my Email Address and password as authentication for the site. I have the following code:
import cStringIO
import pycurl
import urllib
url = 'http://www.riglocator.ca/report=rig%2Frig%2D150226%2Ecsv'
def GetPage(url, proxy=None):
if proxy:
port = 8888
proxy = proxy.replace("socks://", "")
if ":" in proxy:
port = int(proxy.rsplit(":", 1)[1])
proxy = proxy.rsplit(":", 1)[0]
try:
buf = cStringIO.StringIO()
c = pycurl.Curl()
c.setopt(c.URL, url)
c.setopt(c.WRITEFUNCTION, buf.write)
c.setopt(c.CONNECTTIMEOUT, 5)
c.setopt(c.TIMEOUT, 8)
if proxy:
c.setopt(pycurl.PROXY, proxy)
c.setopt(pycurl.PROXYPORT, port)
c.setopt(pycurl.PROXYTYPE, pycurl.PROXYTYPE_SOCKS5)
c.setopt(pycurl.USERPWD, 'john#mail.com:password123')
c.setopt(c.FOLLOWLOCATION, True)
c.perform()
c.close()
results = buf.getvalue()
buf.close()
except:
results = ""
return results
GetPage(url,"socks://127.0.0.1:8888")
def loader():
csv_url = GetPage(url,"socks://127.0.0.1:8888")
r = urllib.urlopen(csv_url)
print(r)
csv = r.read()
csv_str = str(csv)
lines = csv_str.split('\\n')
dest_url = r'mapfile.csv'
fx = open(dest_url, 'w')
for line in lines:
fx.write(line + '\n')
fx.close()
loader()
But this still returns the HTML code from the login page, any suggestions?
I am getting this error:
File "C:/Users/cevans/PycharmProjects/RigLocatorMapPull/rigmapscrape.py", line 55, in <module>
loader()
File "C:/Users/cevans/PycharmProjects/RigLocatorMapPull/rigmapscrape.py", line 44, in loader
r = urllib.urlopen(csv_url)
File "C:\Python27\lib\urllib.py", line 87, in urlopen
return opener.open(url)
File "C:\Python27\lib\urllib.py", line 208, in open
return getattr(self, name)(url)
File "C:\Python27\lib\urllib.py", line 463, in open_file
return self.open_local_file(url)
File "C:\Python27\lib\urllib.py", line 477, in open_local_file
raise IOError(e.errno, e.strerror, e.filename)
IOError: [Errno 2] The system cannot find the path specified: ''
Process finished with exit code 1

Here is a link to some code I wrote to grab a file with pycurl, it should do basically what you need to do. You just need to add the option c.setopt(pycurl.USERPWD, 'username:userpass') do my code to set your username and password.
http://prestongarrison.com/proper-python-pycurl-example/

#This is a solution using the Mechanize browser library which takes the url,
#changes it to the current date, submits the username/password in a form,
#downloads a csv and writes it to a folder location:
__author__ = 'cevans'
import mechanize
import os
import cookielib
import datetime, string
USERNAME = 'xxxx'
PASSWORD = 'xxxxx'
OLDURL = 'http://www.oldurl.com/report050301'
folder = r'\\Driver'
def loader():
#Takes current date and changes URL to grab correct datefile (Schedule only runs on day of week)
cdate = str(datetime.date.today().strftime("%y%m%d"))
DATAURL = string.replace(OLDURL,'150301',cdate)
# Browser and Cookie Jar
br = mechanize.Browser()
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)
# Browser options
br.set_handle_equiv(True)
br.set_handle_gzip(False)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(True)
# Follows refresh 0 but not hangs on refresh > 0
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
# Opens site:
r = br.open(DATAURL)
html = r.read()
br.select_form(nr=0)
br.form['nauthemail']= USERNAME
br.form['password']=PASSWORD
br.submit()
r = br.open(DATAURL)
#Read and write file to csv, in folder
csv = r.read()
csv_str = str(csv)
lines = csv_str.split('\\n')
fname = 'map-'+ cdate
base_filename=fname
filename_suffix = '.csv'
folder1 = os.path.join(folder, base_filename + filename_suffix)
dest_url = folder1
fx = open(dest_url, 'w')
for line in lines:
fx.write(line + '\n')
fx.close()
loader()

Related

Python Upload Files to Sharepoint using Shareplum

I am using this much-shared code to try and upload a file to Sharepoint using Shareplum, into the Shared Documents folder.
import requests
from shareplum import Office365
# Set Login Info
username = 'my.email#address.com'
password = 'myverifiedapppassword'
site_name = 'mysite'
base_path = 'https://xxxxxxxx.sharepoint.com'
doc_library = 'Shared%20Documents'
file_name = "hellotest.txt" #when your file in the same directory
# Obtain auth cookie
authcookie = Office365(base_path, username=username, password=password).GetCookies()
session = requests.Session()
session.cookies = authcookie
session.headers.update({'user-agent': 'python_bite/v1'})
session.headers.update({'accept': 'application/json;odata=verbose'})
session.headers.update({'X-RequestDigest': 'FormDigestValue'})
response = session.post(url=base_path + "/sites/" + site_name + "/_api/web/GetFolderByServerRelativeUrl('" + doc_library + "')/Files/add(url='a.txt',overwrite=true)",
data="")
session.headers.update({'X-RequestDigest': response.headers['X-RequestDigest']})
# Upload file
with open(file_name, 'rb') as file_input:
try:
response = session.post(
url=base_path + "/sites/" + site_name + f"/_api/web/GetFolderByServerRelativeUrl('" + doc_library + "')/Files/add(url='"
+ file_name + "',overwrite=true)",
data=file_input)
print("response: ", response.status_code) #it returns 200
if response.status_code == '200':
print("File uploaded successfully")
except Exception as err:
print("Something went wrong: " + str(err))
print('File Uploaded Successfully')
The problem is occuring wheen running the code....i am always getting a traceback and a keyerror as follows:
Traceback (most recent call last):
File "S:\upload.py", line 22, in
session.headers.update({'X-RequestDigest': response.headers['X-RequestDigest']})
File "C:\Python39\lib\site-packages\requests\structures.py", line 54, in getitem
return self._store[key.lower()][1]
KeyError: 'x-requestdigest'
Something to do with x-requestdigest isnt working properly, in line 22, but i cannot figure out what.
Any tips would be greatly appreciated!!!
thanks
I have tried the below code and it is working.
from shareplum import Office365
from shareplum import Site
from shareplum.site import Version
#Logging info
server_url = "https://example.sharepoint.com/"
site_url = server_url + "sites/my_site_name"
Username = 'myusername'
Password = 'mypassword'
Sharepoint_folder = 'Shared Documents'
fileName = 'myfilename'
def file_upload_to_sharepoint(**context):
authcookie = Office365(server_url, username = Username, password=Password).GetCookies()
site = Site(site_url, version=Version.v365, authcookie=authcookie)
folder = site.Folder(Sharepoint_folder)
with open(fileName, mode='rb') as file:
fileContent = file.read()
folder.upload_file(fileContent, "filename.bin")
file_upload_to_sharepoint()
Let me know if this works for you as well.

Tweets streaming to .txt file with Python

I have the below code and want to write the stream of tweets to a text file. Is there a way to include the output to text file within the same code and save it in the working directory? I am an IDE lover and really don't like using the console. I am new to python (2 weeks), I am an R / R Studio user.
I know I could use:
filename.py > output.txt
I am currently using Rodeo, Python 3.6.1.
import oauth2 as oauth
import urllib.request as urllib
api_key = "##"
api_secret = "##"
access_token_key = "##-##"
access_token_secret = "##"
_debug = 0
oauth_token = oauth.Token(key=access_token_key, secret=access_token_secret)
oauth_consumer = oauth.Consumer(key=api_key, secret=api_secret)
signature_method_hmac_sha1 = oauth.SignatureMethod_HMAC_SHA1()
http_method = "GET"
http_handler = urllib.HTTPHandler(debuglevel=_debug)
https_handler = urllib.HTTPSHandler(debuglevel=_debug)
'''
Construct, sign, and open a twitter request
using the hard-coded credentials above.
'''
def twitterreq(url, method, parameters):
req = oauth.Request.from_consumer_and_token(oauth_consumer,
token=oauth_token,
http_method=http_method,
http_url=url,
parameters=parameters)
req.sign_request(signature_method_hmac_sha1, oauth_consumer, oauth_token)
headers = req.to_header()
if http_method == "POST":
encoded_post_data = req.to_postdata()
else:
encoded_post_data = None
url = req.to_url()
opener = urllib.OpenerDirector()
opener.add_handler(http_handler)
opener.add_handler(https_handler)
response = opener.open(url, encoded_post_data)
f = open("output.txt", "wb")
def fetchsamples():
url = "https://stream.twitter.com/1.1/statuses/sample.json"
parameters = []
response = twitterreq(url, "GET", parameters)
for line in response:
f.write(line)
if __name__ == '__main__':
fetchsamples()
# f.close()
Besides the comment I made previously, I would suggesting checking out this stack overflow question: how to direct output into a txt file in python in windows
To quote:
If you want to do it in Python then you would write:
with open('out.txt', 'w') as f:
f.write(something)`
Obviously this is just a trivial example. You'd clearly do more inside the with block.

Python - unknown url type error

from tkinter import *
import tkinter as tk
import pyodbc
import urllib.request
from bs4 import BeautifulSoup,Comment
link =""
def scraper(urls):
with urllib.request.urlopen(urls) as url:
content = url.read()
soup = BeautifulSoup(content, "html.parser")
rows =soup.find_all('div',attrs={"class" : "reviewText"})
for row in soup.find_all('div',attrs={"class" : "reviewText"}):
print(row.text)
root1 = tk.Tk()
label1 = tk.Label(root1, text='product A')
input1 = StringVar()
entry1 = tk.Entry(root1,textvariable=input1)
label1.pack(side = tk.TOP)
entry1.pack()
buttonstr = tk.StringVar()
db = r"C:\Users\Goutham\Documents\keshav\testdb.accdb"
print("connecting db..")
def odbc():
'''
`enter code here`connects with odbc
'''
global link
constr = 'Driver={Microsoft Access Driver (*.mdb, *.accdb)};Dbq=' + db
conn = pyodbc.connect(constr, autocommit=True)
cur = conn.cursor()
check=input1.get()
print("fetching from access.....")
strsql = "select Url from student where PdtName='%s' " % (check,)
cur.execute(strsql)
results = cur.fetchall()
link=check
print (results,check)
conn.close()
buttonA = tk.Button(text = "hello", command = odbc)
buttonA.pack()
scraper(link)
I need this code to get input,store it in the variable -'check' and compare it with the values in the database using a SQL query.The matching values from the database are used to retrieve the URL from the database. The URL is passed as a parameter to the function scraper() which prints the extracted text.
The following error is displayed:
Traceback (most recent call last):
File "C:\Python33\module1.py", line 62, in <module>
scraper(link)
File "C:\Python33\module1.py", line 13, in scraper
with urllib.request.urlopen(urls) as url:
File "C:\Python33\lib\urllib\request.py", line 156, in urlopen
return opener.open(url, data, timeout)
File "C:\Python33\lib\urllib\request.py", line 454, in open
req = Request(fullurl, data)
File "C:\Python33\lib\urllib\request.py", line 275, in __init__
self._parse()
File "C:\Python33\lib\urllib\request.py", line 280, in _parse
raise ValueError("unknown url type: %r" % self.full_url)
ValueError: unknown url type: ''
Please help.
Thank you.
You are calling scraper(link) at the end of your script, and in that moment link is the empty string. That's why you get ValueError: unknown url type: ''.
Remove that statement and perform a validation of the URL format in your odbc callback function.

Getting 403 error when trying to parse dropbox events page with python and mechanize

I use this script to get a list of all file updates to a certain directory. I then parse that list to get a list of time slots I have been active in that directory. That way I can quickly see how much time I have spent on the project and know what to charge my client.
I have written a small python script, adapted from this: https://github.com/jncraton/PythonDropboxUploader
I added the bottom function to retrieve a specific events page from https://www.dropbox.com/events?ns=false&n=50
I have used the script before 2 months ago and it worked well, but now I am getting 403: forbidden errors on:
eventSrc = self.browser.open(req).read()
Probably DropBox tries to block scrapers like mine to push programmers to use their API instead, but unfortunately the API doesn't support listing the events.
Can anybody help me out to get it working again?
This is the python code to create the connection:
import mechanize
import urllib
import re
import json
class DropboxConnection:
""" Creates a connection to Dropbox """
email = ""
password = ""
root_ns = ""
token = ""
browser = None
def __init__(self, email, password):
self.email = email
self.password = password
self.login()
self.get_constants()
def login(self):
""" Login to Dropbox and return mechanize browser instance """
# Fire up a browser using mechanize
self.browser = mechanize.Browser()
self.browser.set_handle_equiv(False)
self.browser.set_handle_redirect(True)
self.browser.set_handle_referer(True)
self.browser.set_handle_robots(False)
self.browser.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:14.0) Gecko/20120722 Firefox/14.0.1')]
# Browse to the login page
self.browser.open('https://www.dropbox.com/login')
# Enter the username and password into the login form
isLoginForm = lambda l: l.action == "https://www.dropbox.com/login" and l.method == "POST"
try:
self.browser.select_form(predicate=isLoginForm)
except:
self.browser = None
raise(Exception('Unable to find login form'))
self.browser['login_email'] = self.email
self.browser['login_password'] = self.password
self.browser['t'] = "1230"
# Send the form
response = self.browser.submit()
def get_constants(self):
""" Load constants from page """
home_src = self.browser.open('https://www.dropbox.com/home').read()
try:
self.root_ns = re.findall(r"root_ns: (\d+)", home_src)[0]
self.token = re.findall(r"TOKEN: '(.+)'", home_src)[0]
except:
raise(Exception("Unable to find constants for AJAX requests"))
def upload_file(self, local_file, remote_dir, remote_file):
""" Upload a local file to Dropbox """
if(not self.is_logged_in()):
raise(Exception("Can't upload when not logged in"))
self.browser.open('https://www.dropbox.com/')
# Add our file upload to the upload form
isUploadForm = lambda u: u.action == "https://dl-web.dropbox.com/upload" and u.method == "POST"
try:
self.browser.select_form(predicate=isUploadForm)
except:
raise(Exception('Unable to find upload form'))
self.browser.form.find_control("dest").readonly = False
self.browser.form.set_value(remote_dir, "dest")
self.browser.form.add_file(open(local_file, "rb"), "", remote_file)
# Submit the form with the file
self.browser.submit()
def get_dir_list(self, remote_dir):
""" Get file info for a directory """
if(not self.is_logged_in()):
raise(Exception("Can't download when not logged in"))
req_vars = "ns_id=" + self.root_ns + "&referrer=&t=" + self.token
req = urllib2.Request('https://www.dropbox.com/browse' + remote_dir, data=req_vars)
req.add_header('Referer', 'https://www.dropbox.com/home' + remote_dir)
dir_info = json.loads(self.browser.open(req).read())
dir_list = {}
for item in dir_info['file_info']:
# Eliminate directories
if(item[0] == False):
# get local filename
absolute_filename = item[3]
local_filename = re.findall(r".*\/(.*)", absolute_filename)[0]
# get file URL and add it to the dictionary
file_url = item[8]
dir_list[local_filename] = file_url
return dir_list
def get_download_url(self, remote_dir, remote_file):
""" Get the URL to download a file """
return self.get_dir_list(remote_dir)[remote_file]
def download_file(self, remote_dir, remote_file, local_file):
""" Download a file and save it locally """
fh = open(local_file, "wb")
fh.write(self.browser.open(self.get_download_url(remote_dir, remote_file)).read())
fh.close()
def is_logged_in(self):
""" Checks if a login has been established """
if(self.browser):
return True
else:
return False
def getEventsPage(self, n):
if(not self.is_logged_in()):
raise(Exception("Can't get event page when not logged in"))
url = 'https://www.dropbox.com/next_events'
values = {'cur_page': n, 'ns_id': 'false'}
data = urllib.urlencode(values)
req = mechanize.Request(url, data)
# print url + '?' + data
eventSrc = self.browser.open(req).read()
return eventSrc
And this is the loop that parses the events pages:
from dbupload import DropboxConnection
from getpass import getpass
from bs4 import BeautifulSoup
import re
import parsedatetime.parsedatetime as pdt
import parsedatetime.parsedatetime_consts as pdc
c = pdc.Constants()
p = pdt.Calendar(c)
email = "myemail#gmail.com" # raw_input("Enter Dropbox email address:")
password = getpass("Enter Dropbox password:")
dateFile = open('all_file_updates.txt', "wb")
try:
# Create the connection
conn = DropboxConnection(email, password)
except:
print("Connection failed")
else:
print("Connection succesful")
n = 250
found = 0
while(n >= 0):
eventsPageSrc = conn.getEventsPage(n)
soup = BeautifulSoup(eventsPageSrc)
table = soup.find("table", {"id": "events"})
for row in table.findAll('tr'):
link = row.find("a", href=re.compile('^https://dl-web.dropbox.com/get/ProjectName'))
if(link != None):
dateString = row.find("td", attrs={'class': 'modified'}).string
date = p.parse(dateString)
dateFile.write('Date: ' + str(date) + ' file: ' + link.string + '\n')
found = found + 1
n = n - 1
print 'page: ' + str(n) + ' Total found: ' + str(found)
In def get_constants(self): change
self.token = re.findall(r"TOKEN: '(.+)'", home_src)[0]
to
self.token = re.findall(r'TOKEN: "(.+)"', home_src)[0]
dropbox has changed the way it stores constants
Hope it helps.

How do I debug this error with Python?

My code that I will post below gives me this error and I can't figure out why or how to fix it. If anyone could help I would greatly appreciate it. Thanks!
Traceback (most recent call last):
File "C:\Users\Robert\Documents\j-a-c-o-b\newlc.py", line 99, in <module>
main()
File "C:\Users\Robert\Documents\j-a-c-o-b\newlc.py", line 76, in main
for final_url in pool.imap(handle_listing, listings):
File "C:\Python27\lib\site-packages\eventlet-0.9.16-py2.7.egg\eventlet\greenpool.py", line 232, in next
val = self.waiters.get().wait()
File "C:\Python27\lib\site-packages\eventlet-0.9.16-py2.7.egg\eventlet\greenthread.py", line 166, in wait
return self._exit_event.wait()
File "C:\Python27\lib\site-packages\eventlet-0.9.16-py2.7.egg\eventlet\event.py", line 120, in wait
current.throw(*self._exc)
File "C:\Python27\lib\site-packages\eventlet-0.9.16-py2.7.egg\eventlet\greenthread.py", line 192, in main
result = function(*args, **kwargs)
File "C:\Users\Robert\Documents\j-a-c-o-b\newlc.py", line 48, in handle_listing
yellow_page = BeautifulSoup(download(yellow_page_url))
File "build\bdist.win32\egg\BeautifulSoup.py", line 1519, in __init__
BeautifulStoneSoup.__init__(self, *args, **kwargs)
File "build\bdist.win32\egg\BeautifulSoup.py", line 1144, in __init__
self._feed(isHTML=isHTML)
File "build\bdist.win32\egg\BeautifulSoup.py", line 1168, in _feed
smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
File "build\bdist.win32\egg\BeautifulSoup.py", line 1770, in __init__
self._detectEncoding(markup, isHTML)
File "build\bdist.win32\egg\BeautifulSoup.py", line 1915, in _detectEncoding
'^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
TypeError: expected string or buffer
I don't know what it wants or what it means...
This is my code:
from gzip import GzipFile
from cStringIO import StringIO
import re
import webbrowser
import time
from difflib import SequenceMatcher
import os
import sys
from BeautifulSoup import BeautifulSoup
import eventlet
from eventlet.green import urllib2
import urllib2
import urllib
def download(url):
print "Downloading:", url
s = urllib2.urlopen(url).read()
if s[:2] == '\x1f\x8b':
ifh = GzipFile(mode='rb', fileobj=StringIO(s))
s = ifh.read()
print "Downloaded: ", url
return s
def replace_chars(text, replacements):
return ''.join(replacements.get(x,x) for x in text)
def handle_listing(listing_url):
listing_document = BeautifulSoup(download(listing_url))
# ignore pages that link to yellowpages
if not listing_document.find("a", href=re.compile(re.escape("http://www.yellowpages.com/") + ".*")):
listing_title = listing_document.title.text
# define an alphabet
alfa = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
reps = {' ':'-', ',':'', '\'':'', '[':'', ']':'', '-Suite-' + alfa[1-26] : ''}
if TITLE_MATCH.match(listing_title) is not None:
title, = TITLE_MATCH.match(listing_title).groups()
if ADDRESS_MATCH.match(listing_title) is not None:
address, = ADDRESS_MATCH.match(listing_title).groups()
yellow_page_url = "http://www.yellowpages.com/%s/%s?order=distance" % (
replace_chars(address, reps),
replace_chars(title, reps),
)
yellow_page = BeautifulSoup(download(yellow_page_url))
page_url = yellow_page.find("h3", {"class" : "business-name fn org"})
if page_url:
page_url = page_url.a["href"]
business_name = title[:title.index(",")]
page = BeautifulSoup(download(page_url))
yellow_page_address = page.find("span", {"class" : "street-address"})
if yellow_page_address:
if SequenceMatcher(None, address, yellow_page_address.text).ratio() >= 0.5:
pid, = re.search(r'p(\d{5,20})\.jsp', listing_url).groups(0)
page_escaped = replace_chars(page_url, {':':'%3A', '/':'%2F', '?':'%3F', '=':'%3D'})
final_url = "http://www.locationary.com/access/proxy.jsp?ACTION_TOKEN=proxy_jsp$JspView$SaveAction&inPlaceID=%s&xxx_c_1_f_987=%s" % (
pid, page_escaped)
return final_url
def main():
pool = eventlet.GreenPool()
listings_document = BeautifulSoup(download(START_URL))
listings = listings_document.findAll("a", href = LOCATION_LISTING)
listings = [listing['href'] for listing in listings]
for final_url in pool.imap(handle_listing, listings):
print final_url
"""
if str(final_url) is not None:
url = str(final_url)
req = urllib2.Request(url)
response = urllib2.urlopen(req)
page = response.read()
time.sleep(2)
"""
for a in range(0,1):
START_URL = 'http://www.locationary.com/place/en/US/Arkansas/Fayetteville-page2/?ACTION_TOKEN=NumericAction'
TITLE_MATCH = re.compile(r'(.*) \(\d{1,10}.{1,100}\)$')
ADDRESS_MATCH = re.compile(r'.{1,100}\((.*), .{4,14}, United States\)$')
LOCATION_LISTING = re.compile(r'http://www\.locationary\.com/place/en/US/.{1,50}/.{1,50}/.{1,100}\.jsp')
if __name__ == '__main__':
main()
A very common mistake made by novices using any language that supports exceptions is that they catch exceptions that they do not actually handle. This leads to hard-to-debug errors since it disrupts the normal flow of the program.
Specifically, catching urllib2.HTTPError in download() is preventing actual problems from being propagated to the rest of the program. Either remove the exception handler altogether, or raise at the end of the handler to maintain flow.

Categories