How to explore and download files from a web page in Python 3 - python

I have created this GitHub repository with the code for each Python version, in case you would like to check the current code.
https://github.com/AndresUrregoAngel/Python3-request/tree/current
I have a script in Python 2 that logs in, explores, and downloads a couple of files from a page using urllib2 and other modules. I would like to migrate this script to Python 3 using the requests module. Unfortunately, I have not managed to do that, because I cannot keep the session open to explore the page and then download the files. Please check the scripts and let me know how I could fix it.
PythonV2
import os
import time
import urllib
import urllib2
import cookielib
import datetime

# Here are your queue names
QUEUES = {'dis0003': ['dis0003-xxxxxxx', 'dis0003-yyyyyy',
                      'dis0003-zzzzzzzz'],
          'dis0006': ['dis0006-xxxxxxx', 'dis0006-yyyyyyyy',
                      'dis0006-zzzzzzzz',
                      'dis0006-mmmmmmm',
                      'dis0006-nnnnnnnnnn']}

# Your admin login and password
LOGIN = "xxx"
PASSWORD = "xxxxx"
ROOT = "https://xxxxx"

# The client have to take care of the cookies.
jar = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(jar))

# POST login query on '/login_handler' (post data are: 'login' and 'password').
req = urllib2.Request(ROOT + "/login_handler",
                      urllib.urlencode({'login': LOGIN,
                                        'password': PASSWORD}))
opener.open(req)

# Set the right accountcode
for accountcode, queues in QUEUES.items():
    req = urllib2.Request(ROOT + "/switch_to/" + accountcode)
    opener.open(req)

    NOW = datetime.datetime.now()
    YEAR = NOW.year
    FROM_MONTH = NOW.month
    TO_MONTH = NOW.month
    FROM_DAY = NOW.day
    TO_DAY = NOW.day
    from_ts = time.mktime(datetime.datetime(
        YEAR, FROM_MONTH, FROM_DAY).timetuple())
    to_ts = time.mktime(datetime.datetime(
        YEAR, TO_MONTH, TO_DAY).timetuple())

    # Get the CSV and write it to files
    for queue in queues:
        url = "%s/queue/csv/stats/%s/%s/%s" % (
            ROOT, queue, int(from_ts), int(to_ts))
        sections = []
        section = []
        for line in opener.open(urllib2.Request(url)).read().split('\n'):
            if line:
                section.append(line)
            else:
                sections.append(section)
                section = []
        if section:
            sections.append(section)
        for i, section in enumerate(sections):
            open(os.path.join("file", "%s-%d.csv" % (queue, i + 1)),
                 "wb").write('\n'.join(section))

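A minimal Python 3 sketch of the same flow using the requests module might look like the following. The key point is to send every request through a single requests.Session object, which keeps the login cookies alive between calls, which is exactly what the urllib2 opener with its CookieJar does above. The endpoints and placeholder credentials are the ones from the Python 2 script; splitting the CSV into sections would work the same way as before:

import datetime
import os
import time

import requests

LOGIN = "xxx"
PASSWORD = "xxxxx"
ROOT = "https://xxxxx"
QUEUES = {'dis0003': ['dis0003-xxxxxxx', 'dis0003-yyyyyy']}

with requests.Session() as session:
    # The session stores the cookies returned by the login handler,
    # so the later requests stay authenticated.
    session.post(ROOT + "/login_handler",
                 data={'login': LOGIN, 'password': PASSWORD})

    for accountcode, queues in QUEUES.items():
        session.get(ROOT + "/switch_to/" + accountcode)

        now = datetime.datetime.now()
        day_ts = int(time.mktime(
            datetime.datetime(now.year, now.month, now.day).timetuple()))

        for queue in queues:
            url = "%s/queue/csv/stats/%s/%s/%s" % (ROOT, queue, day_ts, day_ts)
            response = session.get(url)
            # Write the raw CSV; splitting it into sections can be done
            # exactly as in the Python 2 version.
            with open(os.path.join("file", "%s.csv" % queue), "wb") as out:
                out.write(response.content)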
Related

Python script for WordPress (mix Google Images download + auto-post to WordPress)

Hello, can anyone help me combine two Python scripts? I have a script that automatically posts articles to WordPress, and another script that downloads photos from Google Images. I would like a single script that downloads images from Google for a given keyword and posts them as new articles to a photoblog running on WordPress. I am a novice, so I would appreciate your help!
Google Images script:
# coding: utf-8
#Searching and Downloading Google Images/Image Links
#Import Libraries
import time #Importing the time library to check the time of code execution
import sys #Importing the System Library
import os
import urllib2

########### Edit From Here ###########

#This list is used to search keywords. You can edit this list to search for google images of your choice. You can simply add and remove elements of the list.
search_keyword = ['Australia']

#This list is used to further add suffix to your search term. Each element of the list will help you download 100 images. First element is blank which denotes that no suffix is added to the search keyword of the above list. You can edit the list by adding/deleting elements from it. So if the first element of the search_keyword is 'Australia' and the second element of keywords is 'high resolution', then it will search for 'Australia High Resolution'
keywords = [' high resolution']

########### End of Editing ###########

#Downloading entire Web Document (Raw Page Content)
def download_page(url):
    version = (3, 0)
    cur_version = sys.version_info
    if cur_version >= version:  #If the Current Version of Python is 3.0 or above
        import urllib.request   #urllib library for Extracting web pages
        try:
            headers = {}
            headers['User-Agent'] = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
            req = urllib.request.Request(url, headers=headers)
            resp = urllib.request.urlopen(req)
            respData = str(resp.read())
            return respData
        except Exception as e:
            print(str(e))
    else:  #If the Current Version of Python is 2.x
        import urllib2
        try:
            headers = {}
            headers['User-Agent'] = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"
            req = urllib2.Request(url, headers=headers)
            response = urllib2.urlopen(req)
            page = response.read()
            return page
        except:
            return "Page Not found"

#Finding 'Next Image' from the given raw page
def _images_get_next_item(s):
    start_line = s.find('rg_di')
    if start_line == -1:  #If no links are found then give an error!
        end_quote = 0
        link = "no_links"
        return link, end_quote
    else:
        start_line = s.find('"class="rg_meta"')
        start_content = s.find('"ou"', start_line + 1)
        end_content = s.find(',"ow"', start_content + 1)
        content_raw = str(s[start_content + 6:end_content - 1])
        return content_raw, end_content

#Getting all links with the help of '_images_get_next_image'
def _images_get_all_items(page):
    items = []
    while True:
        item, end_content = _images_get_next_item(page)
        if item == "no_links":
            break
        else:
            items.append(item)  #Append all the links in the list named 'Links'
            time.sleep(0.1)     #Timer could be used to slow down the request for image downloads
            page = page[end_content:]
    return items

############## Main Program ############
t0 = time.time()  #start the timer

#Download Image Links
i = 0
while i < len(search_keyword):
    items = []
    iteration = "Item no.: " + str(i + 1) + " -->" + " Item name = " + str(search_keyword[i])
    print (iteration)
    print ("Evaluating...")
    search_keywords = search_keyword[i]
    search = search_keywords.replace(' ', '%20')

    #make a search keyword directory
    try:
        os.makedirs(search_keywords)
    except OSError, e:
        if e.errno != 17:
            raise
        # time.sleep might help here
        pass

    j = 0
    while j < len(keywords):
        pure_keyword = keywords[j].replace(' ', '%20')
        url = 'https://www.google.com/search?q=' + search + pure_keyword + '&espv=2&biw=1366&bih=667&site=webhp&source=lnms&tbm=isch&sa=X&ei=XosDVaCXD8TasATItgE&ved=0CAcQ_AUoAg'
        raw_html = (download_page(url))
        time.sleep(0.1)
        items = items + (_images_get_all_items(raw_html))
        j = j + 1
    #print ("Image Links = "+str(items))
    print ("Total Image Links = " + str(len(items)))
    print ("\n")

    #This allows you to write all the links into a test file. This text file will be created in the same directory as your code. You can comment out the below 3 lines to stop writing the output to the text file.
    info = open('output.txt', 'a')  #Open the text file called database.txt
    info.write(str(i) + ': ' + str(search_keyword[i - 1]) + ": " + str(items) + "\n\n\n")  #Write the title of the page
    info.close()  #Close the file

    t1 = time.time()  #stop the timer
    total_time = t1 - t0  #Calculating the total time required to crawl, find and download all the links of 60,000 images
    print("Total time taken: " + str(total_time) + " Seconds")
    print ("Starting Download...")

    ## To save imges to the same directory
    # IN this saving process we are just skipping the URL if there is any error
    k = 0
    errorCount = 0
    while (k < len(items)):
        from urllib2 import Request, urlopen
        from urllib2 import URLError, HTTPError
        try:
            req = Request(items[k], headers={"User-Agent": "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"})
            response = urlopen(req, None, 15)
            output_file = open(search_keywords + "/" + str(k + 1) + ".jpg", 'wb')
            data = response.read()
            output_file.write(data)
            response.close()
            print("completed ====> " + str(k + 1))
            k = k + 1
        except IOError:  #If there is any IOError
            errorCount += 1
            print("IOError on image " + str(k + 1))
            k = k + 1
        except HTTPError as e:  #If there is any HTTPError
            errorCount += 1
            print("HTTPError" + str(k))
            k = k + 1
        except URLError as e:
            errorCount += 1
            print("URLError " + str(k))
            k = k + 1

    i = i + 1

print("\n")
print("Everything downloaded!")
print("\n" + str(errorCount) + " ----> total Errors")

#----End of the main program ----#
WordPress script:
import urllib
from wordpress_xmlrpc import Client, WordPressPost
from wordpress_xmlrpc.methods import posts
import xmlrpclib
from wordpress_xmlrpc.compat import xmlrpc_client
from wordpress_xmlrpc.methods import media, posts
import os
########################### Read Me First ###############################
'''
------------------------------------------In DETAIL--------------------------------
Description
===========
Add new posts to WordPress remotely using Python using XMLRPC library provided by the WordPress.
Installation Requirement
************************
Verify you meet the following requirements
==========================================
Install Python 2.7 (Don't download 3+, as most libraries dont yet support version 3).
Install from PyPI using easy_install python-wordpress-xmlrpc
Easy_Install Link: https://pypi.python.org/pypi/setuptools
==========================================
Windows Installation Guide
==========================
-Download and Install Easy_Install from above Link -Extract Downloaded File and from CMD go to the extracted directory and run 'python setup.py install'. This will install easy_install. -Go to %/python27/script and run following command easy_install python-wordpress-xmlrpc
Ubuntu Installation Guide
=========================
sudo apt-get install python-setuptools
sudo easy_install python-wordpress-xmlrpc
Note: Script has its dummy data to work initially which you can change or integrate with your code easily for making it more dynamic.
****************************************
For Bugs/Suggestions
contact#waqasjamal.com
****************************************
------------------------------------------In DETAIL--------------------------------
'''
class Custom_WP_XMLRPC:
    def post_article(self, wpUrl, wpUserName, wpPassword, articleTitle, articleCategories, articleContent, articleTags, PhotoUrl):
        self.path = os.getcwd() + "\\00000001.jpg"
        self.articlePhotoUrl = PhotoUrl
        self.wpUrl = wpUrl
        self.wpUserName = wpUserName
        self.wpPassword = wpPassword
        #Download File
        f = open(self.path, 'wb')
        f.write(urllib.urlopen(self.articlePhotoUrl).read())
        f.close()
        #Upload to WordPress
        client = Client(self.wpUrl, self.wpUserName, self.wpPassword)
        filename = self.path
        # prepare metadata
        data = {'name': 'picture.jpg', 'type': 'image/jpg'}
        # read the binary file and let the XMLRPC library encode it into base64
        with open(filename, 'rb') as img:
            data['bits'] = xmlrpc_client.Binary(img.read())
        response = client.call(media.UploadFile(data))
        attachment_id = response['id']
        #Post
        post = WordPressPost()
        post.title = articleTitle
        post.content = articleContent
        post.terms_names = {'post_tag': articleTags, 'category': articleCategories}
        post.post_status = 'publish'
        post.thumbnail = attachment_id
        post.id = client.call(posts.NewPost(post))
        print 'Post Successfully posted. Its Id is: ', post.id
#########################################
# POST & Wp Credentials Detail #
#########################################
#Url of Image on the internet
ariclePhotoUrl='http://i1.tribune.com.pk/wp-content/uploads/2013/07/584065-twitter-1375197036-960-640x480.jpg'
# Dont forget the /xmlrpc.php cause thats your posting adress for XML Server
wpUrl='http://YourWebSite.com/xmlrpc.php'
#WordPress Username
wpUserName='WordPressUsername'
#WordPress Password
wpPassword='YourWordPressPassword'
#Post Title
articleTitle='Testing Python Script version 3'
#Post Body/Description
articleContent='Final .... Testing Fully Automated'
#list of tags
articleTags=['code','python']
#list of Categories
articleCategories=['language','art']
#########################################
# Creating Class object & calling the xml rpc custom post Function
#########################################
xmlrpc_object = Custom_WP_XMLRPC()
#On Post submission this function will print the post id
xmlrpc_object.post_article(wpUrl,wpUserName,wpPassword,articleTitle, articleCategories, articleContent, articleTags,ariclePhotoUrl)
For example, with:
search_keyword = ['Australia']
it should download 20 images from Google Images and upload them to WordPress as new posts:
#########################################
# POST & Wp Credentials Detail #
#########################################
#Url of Image on the internet
ariclePhotoUrl='https://www.google.com/search?q=' search_keyword +'&espv=2&biw=1366&bih=667&site=webhp&source=lnms&tbm=isch&sa=X&ei=XosDVaCXD8TasATItgE&ved=0CAcQ_AUoAg'
# Dont forget the /xmlrpc.php cause thats your posting adress for XML Server
wpUrl='http://YourWebSite.com/xmlrpc.php'
#WordPress Username
wpUserName='WordPressUsername'
#WordPress Password
wpPassword='YourWordPressPassword'
#Post Title
articleTitle='Testing Python Script version 3'
#Post Body/Description
articleContent='Final .... Testing Fully Automated'
#list of tags
articleTags=['code','python']
#list of Categories
articleCategories=['language','art']
#########################################
# Creating Class object & calling the xml rpc custom post Function
#########################################
xmlrpc_object = Custom_WP_XMLRPC()
#On Post submission this function will print the post id
xmlrpc_object.post_article(wpUrl,wpUserName,wpPassword,articleTitle, articleCategories, articleContent, articleTags,ariclePhotoUrl)
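A rough sketch of how the two scripts could be glued together, assuming download_page, _images_get_all_items, Custom_WP_XMLRPC and the wpUrl / wpUserName / wpPassword / articleTags / articleCategories values above are defined (or imported) in the same file; the keyword, the post titles and the 20-image limit are placeholders:

# Sketch: fetch image links for one keyword with the Google Images helpers
# above, then post each image to WordPress as a new article.
search_keyword = 'Australia'
max_images = 20

search = search_keyword.replace(' ', '%20')
url = ('https://www.google.com/search?q=' + search +
       '&espv=2&biw=1366&bih=667&site=webhp&source=lnms&tbm=isch'
       '&sa=X&ei=XosDVaCXD8TasATItgE&ved=0CAcQ_AUoAg')

raw_html = download_page(url)
image_links = _images_get_all_items(raw_html)[:max_images]

xmlrpc_object = Custom_WP_XMLRPC()
for n, image_url in enumerate(image_links):
    # One post per downloaded image; title and body are placeholders.
    xmlrpc_object.post_article(wpUrl, wpUserName, wpPassword,
                               '%s %d' % (search_keyword, n + 1),
                               articleCategories,
                               'Auto-posted image for %s' % search_keyword,
                               articleTags,
                               image_url)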

How to download large files in Python 2

I'm trying to download large files (approx. 1 GB) with the mechanize module, but I have been unsuccessful. I've searched for similar threads, but I have only found ones where the files are publicly accessible and no login is required to obtain them. That is not my case: the file is in a private section, so I need to log in before the download. Here is what I've done so far.
import mechanize

g_form_id = ""

def is_form_found(form1):
    return "id" in form1.attrs and form1.attrs['id'] == g_form_id

def select_form_with_id_using_br(br1, id1):
    global g_form_id
    g_form_id = id1
    try:
        br1.select_form(predicate=is_form_found)
    except mechanize.FormNotFoundError:
        print "form not found, id: " + g_form_id
        exit()

url_to_login = "https://example.com/"
url_to_file = "https://example.com/download/files/filename=fname.exe"
local_filename = "fname.exe"

br = mechanize.Browser()
br.set_handle_robots(False)   # ignore robots
br.set_handle_refresh(False)  # can sometimes hang without this
br.addheaders = [('User-agent', 'Firefox')]

response = br.open(url_to_login)

# Find login form
select_form_with_id_using_br(br, 'login-form')
# Fill in data
br.form['email'] = 'email#domain.com'
br.form['password'] = 'password'
br.set_all_readonly(False)  # allow everything to be written to
br.submit()

# Try to download file
br.retrieve(url_to_file, local_filename)
But I'm getting an error when 512MB is downloaded:
Traceback (most recent call last):
  File "dl.py", line 34, in <module>
    br.retrieve(br.retrieve(url_to_file, local_filename)
  File "C:\Python27\lib\site-packages\mechanize\_opener.py", line 277, in retrieve
    block = fp.read(bs)
  File "C:\Python27\lib\site-packages\mechanize\_response.py", line 199, in read
    self.__cache.write(data)
MemoryError: out of memory
Do you have any ideas how to solve this?
Thanks
You can use bs4 and requests to log in and then write the streamed content. There are a few form fields required, including a _token_ field that is definitely necessary:
from bs4 import BeautifulSoup
import requests
from urlparse import urljoin

data = {'email': 'email#domain.com', 'password': 'password'}
base = "https://support.codasip.com"

with requests.Session() as s:
    # update headers
    s.headers.update({'User-agent': 'Firefox'})
    # use bs4 to parse the form fields
    soup = BeautifulSoup(s.get(base).content)
    form = soup.select_one("#frm-loginForm")
    # works as it is a relative path. Not always the case.
    action = form["action"]
    # Get rest of the fields, ignore password and email.
    for inp in form.find_all("input", {"name": True, "value": True}):
        name, value = inp["name"], inp["value"]
        if name not in data:
            data[name] = value
    # login
    s.post(urljoin(base, action), data=data)
    # get protected url
    with open(local_filename, "wb") as f:
        for chk in s.get(url_to_file, stream=True).iter_content(1024):
            f.write(chk)
Try downloading and writing the file in chunks. It seems the file is taking up all of your memory.
You should specify a Range header in your request if the server supports it.
https://en.wikipedia.org/wiki/List_of_HTTP_header_fields
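A minimal sketch of that idea with requests, assuming the server answers Range requests with 206 Partial Content, and that s, url_to_file and local_filename are the ones from the answer above:

# Sketch: download a large file piece by piece using HTTP Range requests.
# Assumes `s` is an already authenticated requests.Session and the server
# supports partial content (responds with 206).
chunk = 10 * 1024 * 1024  # 10 MB per request
offset = 0

with open(local_filename, "wb") as f:
    while True:
        headers = {"Range": "bytes=%d-%d" % (offset, offset + chunk - 1)}
        resp = s.get(url_to_file, headers=headers)
        if resp.status_code not in (200, 206) or not resp.content:
            break
        f.write(resp.content)
        if len(resp.content) < chunk or resp.status_code == 200:
            # 200 means the server ignored the Range header and sent everything.
            break
        offset += chunk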

deadline = None after using urlfetch.set_default_fetch_deadline(n)

I'm working on a web application with Python and Google App Engine.
I tried to set the default URLFetch deadline globally as suggested in a previous thread:
https://stackoverflow.com/a/14698687/2653179
urlfetch.set_default_fetch_deadline(45)
However, it doesn't work: when I print its value in one of the handlers, urlfetch.get_default_fetch_deadline() is None.
Here is main.py:
from google.appengine.api import users
import webapp2
import jinja2
import random
import string
import hashlib
import CQutils
import time
import os
import httpRequests
import logging
from google.appengine.api import urlfetch

urlfetch.set_default_fetch_deadline(45)
...

class Del(webapp2.RequestHandler):
    def get(self):
        id = self.request.get('id')
        ext = self.request.get('ext')
        user_id = httpRequests.advance(id, ext)
        d2 = urlfetch.get_default_fetch_deadline()
        logging.debug("value of deadline = %s", d2)
Prints in the Log console:
DEBUG 2013-09-05 07:38:21,654 main.py:427] value of deadline = None
The function which is being called in httpRequests.py:
def advance(id, ext=None):
    url = "http://localhost:8080/api/" + id + "/advance"
    if ext is None:
        ext = ""
    params = urllib.urlencode({'ext': ext})
    result = urlfetch.fetch(url=url,
                            payload=params,
                            method=urlfetch.POST,
                            headers={'Content-Type': 'application/x-www-form-urlencoded'})
    if (result.status_code == 200):
        return result.content
I know this is an old question, but I recently ran into this issue.
The setting is placed into a thread-local, meaning that if your application is set to thread-safe and you handle a request in a different thread than the one you set the default deadline for, it can be lost. For me, the solution was to set the deadline before every request as part of the middleware chain.
This is not documented, and required looking through the source to figure it out.
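A minimal sketch of that approach for a Python 2.7 App Engine app, using the webapp_add_wsgi_middleware hook in appengine_config.py; the 45-second value mirrors the question, and the rest is illustrative:

# appengine_config.py -- sketch of per-request deadline middleware
from google.appengine.api import urlfetch

def webapp_add_wsgi_middleware(app):
    def set_deadline_middleware(environ, start_response):
        # Re-apply the default deadline in the thread serving this request,
        # since the value is stored thread-locally.
        urlfetch.set_default_fetch_deadline(45)
        return app(environ, start_response)
    return set_deadline_middleware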

How do you IXR_Base64 in python?

What I'm trying to do is upload a picture to wordpress using wp.uploadFile xmlrpc method.
To do this, in PHP there is an example here: https://stackoverflow.com/a/8910496/1212382
I'm trying to do the same thing in python but I don't know how.
Anyone any ideas?
OK, the answer lies in the xmlrpclib module.
To send base64-encoded bits to WordPress from Python, you need to use xmlrpclib like so:
base64bits = xmlrpclib.Binary(file_content)
Then you just pass the base64bits variable as the 'bits' parameter in your wp.uploadFile XML-RPC request.
To be a little more exact, here's the complete Python code for how this should be done:
import xmlrpclib
import urllib2
from datetime import date
import time

def get_url_content(url):
    try:
        content = urllib2.urlopen(url)
        return content.read()
    except:
        print 'error! NOOOOOO!!!'

file_url = 'http://the path to your picture'
extension = file_url.split(".")
leng = extension.__len__()
extension = extension[leng - 1]

if (extension == 'jpg'):
    xfileType = 'image/jpeg'
elif (extension == 'png'):
    xfileType = 'image/png'
elif (extension == 'bmp'):
    xfileType = 'image/bmp'

file = get_url_content(file_url)
file = xmlrpclib.Binary(file)

server = xmlrpclib.Server('http://website.com/xmlrpc.php')
filename = str(date.today()) + str(time.strftime('%H:%M:%S'))

mediarray = {'name': filename + '.' + extension,
             'type': xfileType,
             'bits': file,
             'overwrite': 'false'}

xarr = ['1', 'USERHERE', 'PASSWORDHERE', mediarray]
result = server.wp.uploadFile(xarr)
print result

How do I search for unpublished Plone content in an IPython debug shell?

I like to use IPython's zope profile to inspect my Plone instance, but a few annoying permissions differences come up compared to inserting a breakpoint and hitting it with the admin user.
For example, I would like to iterate over the content objects in an unpublished testing folder. This query will return no results in the shell, but works from a breakpoint.
$ bin/instance shell
$ ipython --profile=zope
from Products.CMFPlone.utils import getToolByName
catalog = getToolByName(context, 'portal_catalog')
catalog({'path':'Plone/testing'})
Can I authenticate as admin or otherwise rejigger the permissions to fully manipulate my site from ipython?
Here's the (very dirty) code I use to manage my Plone app from the debug shell. It may require some updates depending on your versions of Zope and Plone.
from sys import stdin, stdout, exit
import base64
from thread import get_ident
from ZPublisher.HTTPRequest import HTTPRequest
from ZPublisher.HTTPResponse import HTTPResponse
from ZPublisher.BaseRequest import RequestContainer
from ZPublisher import Publish
from AccessControl import ClassSecurityInfo, getSecurityManager
from AccessControl.SecurityManagement import newSecurityManager
from AccessControl.User import UnrestrictedUser

def loginAsUnrestrictedUser():
    """Example of use:
    old_user = loginAsUnrestrictedUser()
    # Manager stuff
    loginAsUser(old_user)
    """
    current_user = getSecurityManager().getUser()
    newSecurityManager(None, UnrestrictedUser('manager', '', ['Manager'], []))
    return current_user

def loginAsUser(user):
    newSecurityManager(None, user)

def makerequest(app, stdout=stdout, query_string=None, user_pass=None):
    """Make a request suitable for CMF sites & Plone
    - user_pass = "user:pass"
    """
    # copy from Testing.makerequest
    resp = HTTPResponse(stdout=stdout)
    env = {}
    env['SERVER_NAME'] = 'lxtools.makerequest.fr'
    env['SERVER_PORT'] = '80'
    env['REQUEST_METHOD'] = 'GET'
    env['REMOTE_HOST'] = 'a.distant.host'
    env['REMOTE_ADDR'] = '77.77.77.77'
    env['HTTP_HOST'] = '127.0.0.1'
    env['HTTP_USER_AGENT'] = 'LxToolsUserAgent/1.0'
    env['HTTP_ACCEPT'] = 'image/gif, image/x-xbitmap, image/jpeg, */* '
    if user_pass:
        env['HTTP_AUTHORIZATION'] = "Basic %s" % base64.encodestring(user_pass)
    if query_string:
        p_q = query_string.split('?')
        if len(p_q) == 1:
            env['PATH_INFO'] = p_q[0]
        elif len(p_q) == 2:
            (env['PATH_INFO'], env['QUERY_STRING']) = p_q
        else:
            raise TypeError, ''
    req = HTTPRequest(stdin, env, resp)
    req['URL1'] = req['URL']  # fix for CMFQuickInstaller
    #
    # copy/hacked from Localizer __init__ patches
    # first put the needed values in the request
    req['HTTP_ACCEPT_CHARSET'] = 'latin-9'
    #req.other['AcceptCharset'] = AcceptCharset(req['HTTP_ACCEPT_CHARSET'])
    #
    req['HTTP_ACCEPT_LANGUAGE'] = 'fr'
    #accept_language = AcceptLanguage(req['HTTP_ACCEPT_LANGUAGE'])
    #req.other['AcceptLanguage'] = accept_language
    # XXX For backwards compatibility
    #req.other['USER_PREF_LANGUAGES'] = accept_language
    #req.other['AcceptLanguage'] = accept_language
    #
    # Plone stuff
    #req['plone_skin'] = 'Plone Default'
    #
    # then store the request in Publish._requests
    # with the thread id
    id = get_ident()
    if hasattr(Publish, '_requests'):
        # we do not have _requests inside ZopeTestCase
        Publish._requests[id] = req
    # add a brainless session container
    req['SESSION'] = {}
    #
    # ok, let's wrap
    return app.__of__(RequestContainer(REQUEST=req))

def debug_init(app):
    loginAsUnrestrictedUser()
    app = makerequest(app)
    return app
This lives in a wshelpers Zope product. Once the debug shell is launched, it's just a matter of:
>> from Products.wshelpers import wsdebug
>> app = wsdebug.debug_init(app)
>> # now you're logged in as admin
Just use catalog.search({'path':'Plone/testing'}). It performs the same query as catalog() but does not filter the results based on the current user's permissions.
IPython's zope profile does provide a method utils.su('username') to change the current user, but it does not recognize the admin user (defined in /acl_users instead of /Plone/acl_users) and after calling it subsequent calls to catalog() fail with AttributeError: 'module' object has no attribute 'checkPermission'.
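If you do want the rest of the shell session to run as the Zope root admin (the one defined in /acl_users), a commonly used sketch is to fetch that user and start a new security manager with it; the user id 'admin' is an assumption here:

from AccessControl.SecurityManagement import newSecurityManager

# 'admin' is a placeholder for whatever the root user in /acl_users is called.
admin = app.acl_users.getUser('admin')
newSecurityManager(None, admin.__of__(app.acl_users))

# After this, catalog({'path': 'Plone/testing'}) should no longer filter out
# unpublished content.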
