Loading Magnet LINK using Rasterbar libtorrent in Python - python

How would one load a Magnet link via rasterbar libtorrent python binding?

import libtorrent as lt
import time
ses = lt.session()
params = { 'save_path': '/home/downloads/'}
link = "magnet:?xt=urn:btih:4MR6HU7SIHXAXQQFXFJTNLTYSREDR5EI&tr=http://tracker.vodo.net:6970/announce"
handle = lt.add_magnet_uri(ses, link, params)
print 'downloading metadata...'
while (not handle.has_metadata()): time.sleep(1)
print 'got metadata, starting torrent download...'
while (handle.status().state != lt.torrent_status.seeding):
print '%d %% done' % (handle.status().progress*100)
time.sleep(1)

Related

python script wordpress (mix google image +auto post wordpress)

Hello — can anyone help me combine two Python scripts? I have one script that automatically posts articles to WordPress, and another that downloads photos from Google Images. I would like a single script that downloads images from Google for a given keyword and posts them as new articles to a photoblog running on WordPress. I am a novice and would appreciate your help!
Sorry for my English.
google script:
# coding: utf-8
# In[ ]:
#Searching and Downloading Google Images/Image Links
#Import Libraries
#coding: UTF-8
import time #Importing the time library to check the time of code execution
import sys #Importing the System Library
import os
import urllib2
########### Edit From Here ###########
#This list is used to search keywords. You can edit this list to search for google images of your choice. You can simply add and remove elements of the list.
search_keyword = ['Australia']
#This list is used to further add suffix to your search term. Each element of the list will help you download 100 images. First element is blank which denotes that no suffix is added to the search keyword of the above list. You can edit the list by adding/deleting elements from it.So if the first element of the search_keyword is 'Australia' and the second element of keywords is 'high resolution', then it will search for 'Australia High Resolution'
keywords = [' high resolution']
########### End of Editing ###########
#Downloading entire Web Document (Raw Page Content)
def download_page(url):
    """Fetch *url* and return the raw page body as a string.

    Uses urllib.request on Python 3+ and urllib2 on Python 2. A
    browser-like User-Agent header is sent because Google rejects the
    default Python one. On failure the Python-3 path prints the error
    and returns None; the Python-2 path returns "Page Not found".
    """
    version = (3, 0)
    cur_version = sys.version_info
    if cur_version >= version:
        # Python 3.x branch.
        import urllib.request
        try:
            headers = {}
            headers['User-Agent'] = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
            req = urllib.request.Request(url, headers=headers)
            resp = urllib.request.urlopen(req)
            respData = str(resp.read())
            return respData
        except Exception as e:
            print(str(e))
    else:
        # Python 2.x branch.
        import urllib2
        try:
            headers = {}
            headers['User-Agent'] = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"
            req = urllib2.Request(url, headers=headers)
            response = urllib2.urlopen(req)
            page = response.read()
            return page
        except Exception:
            # BUG FIX: narrowed from a bare ``except:`` so that
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            return "Page Not found"
#Finding 'Next Image' from the given raw page
def _images_get_next_item(s):
start_line = s.find('rg_di')
if start_line == -1: #If no links are found then give an error!
end_quote = 0
link = "no_links"
return link, end_quote
else:
start_line = s.find('"class="rg_meta"')
start_content = s.find('"ou"',start_line+1)
end_content = s.find(',"ow"',start_content+1)
content_raw = str(s[start_content+6:end_content-1])
return content_raw, end_content
#Getting all links with the help of '_images_get_next_image'
def _images_get_all_items(page):
    """Collect every image link found in *page*, in document order."""
    links = []
    item, offset = _images_get_next_item(page)
    while item != "no_links":
        links.append(item)
        # Small pause so the follow-up image downloads are throttled.
        time.sleep(0.1)
        page = page[offset:]  # resume scanning after the consumed entry
        item, offset = _images_get_next_item(page)
    return links
############## Main Program ############
t0 = time.time()  # start the timer

# Hoisted out of the download loop (was re-imported on every image).
from urllib2 import Request, urlopen
from urllib2 import URLError, HTTPError

i = 0
# BUG FIX: errorCount was reset to 0 inside the keyword loop, so the
# final total only counted errors from the last keyword.
errorCount = 0
while i < len(search_keyword):
    items = []
    iteration = "Item no.: " + str(i + 1) + " -->" + " Item name = " + str(search_keyword[i])
    print (iteration)
    print ("Evaluating...")
    search_keywords = search_keyword[i]
    search = search_keywords.replace(' ', '%20')

    # Make a directory per search keyword; errno 17 (EEXIST) is fine.
    try:
        os.makedirs(search_keywords)
    except OSError as e:
        if e.errno != 17:
            raise

    # Collect image links for each keyword suffix.
    j = 0
    while j < len(keywords):
        pure_keyword = keywords[j].replace(' ', '%20')
        url = 'https://www.google.com/search?q=' + search + pure_keyword + '&espv=2&biw=1366&bih=667&site=webhp&source=lnms&tbm=isch&sa=X&ei=XosDVaCXD8TasATItgE&ved=0CAcQ_AUoAg'
        raw_html = (download_page(url))
        time.sleep(0.1)
        items = items + (_images_get_all_items(raw_html))
        j = j + 1
    print ("Total Image Links = " + str(len(items)))
    print ("\n")

    # Append all collected links to output.txt.
    # BUG FIX: was search_keyword[i-1], which labelled each batch of
    # links with the PREVIOUS keyword (and the last one on iteration 0).
    info = open('output.txt', 'a')
    info.write(str(i) + ': ' + str(search_keyword[i]) + ": " + str(items) + "\n\n\n")
    info.close()

    t1 = time.time()  # stop the timer
    total_time = t1 - t0
    print("Total time taken: " + str(total_time) + " Seconds")
    print ("Starting Download...")

    ## Save images to the keyword directory, skipping any URL that errors.
    k = 0
    while k < len(items):
        try:
            req = Request(items[k], headers={"User-Agent": "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"})
            response = urlopen(req, None, 15)
            output_file = open(search_keywords + "/" + str(k + 1) + ".jpg", 'wb')
            data = response.read()
            output_file.write(data)
            # BUG FIX: output_file was never closed.
            output_file.close()
            response.close()
            print("completed ====> " + str(k + 1))
        # BUG FIX: HTTPError and URLError both subclass IOError in
        # Python 2, so listing ``except IOError`` first made the two
        # specific handlers unreachable. Most-specific first.
        except HTTPError as e:
            errorCount += 1
            print("HTTPError" + str(k))
        except URLError as e:
            errorCount += 1
            print("URLError " + str(k))
        except IOError:
            errorCount += 1
            print("IOError on image " + str(k + 1))
        k = k + 1
    i = i + 1

print("\n")
print("Everything downloaded!")
print("\n" + str(errorCount) + " ----> total Errors")
#----End of the main program ----#
# In[ ]:
wordpress script:
import urllib
from wordpress_xmlrpc import Client, WordPressPost
from wordpress_xmlrpc.methods import posts
import xmlrpclib
from wordpress_xmlrpc.compat import xmlrpc_client
from wordpress_xmlrpc.methods import media, posts
import os
########################### Read Me First ###############################
'''
------------------------------------------In DETAIL--------------------------------
Description
===========
Add new posts to WordPress remotely using Python using XMLRPC library provided by the WordPress.
Installation Requirement
************************
Verify you meet the following requirements
==========================================
Install Python 2.7 (Don't download 3+, as most libraries dont yet support version 3).
Install from PyPI using easy_install python-wordpress-xmlrpc
Easy_Install Link: https://pypi.python.org/pypi/setuptools
==========================================
Windows Installation Guide
==========================
-Download and Install Easy_Install from above Link -Extract Downloaded File and from CMD go to the extracted directory and run 'python setup.py install'. This will install easy_install. -Go to %/python27/script and run following command easy_install python-wordpress-xmlrpc
Ubuntu Installation Guide
=========================
sudo apt-get install python-setuptools
sudo easy_install python-wordpress-xmlrpc
Note: Script has its dummy data to work initially which you can change or integrate with your code easily for making it more dynamic.
****************************************
For Bugs/Suggestions
contact#waqasjamal.com
****************************************
------------------------------------------In DETAIL--------------------------------
'''
class Custom_WP_XMLRPC:
def post_article(self,wpUrl,wpUserName,wpPassword,articleTitle, articleCategories, articleContent, articleTags,PhotoUrl):
self.path=os.getcwd()+"\\00000001.jpg"
self.articlePhotoUrl=PhotoUrl
self.wpUrl=wpUrl
self.wpUserName=wpUserName
self.wpPassword=wpPassword
#Download File
f = open(self.path,'wb')
f.write(urllib.urlopen(self.articlePhotoUrl).read())
f.close()
#Upload to WordPress
client = Client(self.wpUrl,self.wpUserName,self.wpPassword)
filename = self.path
# prepare metadata
data = {'name': 'picture.jpg','type': 'image/jpg',}
# read the binary file and let the XMLRPC library encode it into base64
with open(filename, 'rb') as img:
data['bits'] = xmlrpc_client.Binary(img.read())
response = client.call(media.UploadFile(data))
attachment_id = response['id']
#Post
post = WordPressPost()
post.title = articleTitle
post.content = articleContent
post.terms_names = { 'post_tag': articleTags,'category': articleCategories}
post.post_status = 'publish'
post.thumbnail = attachment_id
post.id = client.call(posts.NewPost(post))
print 'Post Successfully posted. Its Id is: ',post.id
#########################################
#     POST & Wp Credentials Detail      #
#########################################
# Remote URL of the image to attach to the post.
ariclePhotoUrl = 'http://i1.tribune.com.pk/wp-content/uploads/2013/07/584065-twitter-1375197036-960-640x480.jpg'
# XML-RPC endpoint -- the /xmlrpc.php suffix is required by WordPress.
wpUrl = 'http://YourWebSite.com/xmlrpc.php'
# WordPress credentials.
wpUserName = 'WordPressUsername'
wpPassword = 'YourWordPressPassword'
# Post title and body.
articleTitle = 'Testing Python Script version 3'
articleContent = 'Final .... Testing Fully Automated'
# Tags and categories for the post.
articleTags = ['code', 'python']
articleCategories = ['language', 'art']

#########################################
# Create the helper and publish; the post id is printed on success.
#########################################
xmlrpc_object = Custom_WP_XMLRPC()
xmlrpc_object.post_article(wpUrl, wpUserName, wpPassword, articleTitle, articleCategories, articleContent, articleTags, ariclePhotoUrl)
for exemple :
search_keyword = ['Australia']
download 20 images from Google Images and upload them to WordPress as new posts
#########################################
#     POST & Wp Credentials Detail      #
#########################################
# Url of Image on the internet
# BUG FIX: the original line was missing the '+' concatenation operator
# (a SyntaxError) and tried to concatenate the *list* search_keyword
# itself; use its first element to build the Google Images search URL.
ariclePhotoUrl = 'https://www.google.com/search?q=' + search_keyword[0] + '&espv=2&biw=1366&bih=667&site=webhp&source=lnms&tbm=isch&sa=X&ei=XosDVaCXD8TasATItgE&ved=0CAcQ_AUoAg'
# Dont forget the /xmlrpc.php cause thats your posting adress for XML Server
wpUrl = 'http://YourWebSite.com/xmlrpc.php'
# WordPress Username
wpUserName = 'WordPressUsername'
# WordPress Password
wpPassword = 'YourWordPressPassword'
# Post Title
articleTitle = 'Testing Python Script version 3'
# Post Body/Description
articleContent = 'Final .... Testing Fully Automated'
# list of tags
articleTags = ['code', 'python']
# list of Categories
articleCategories = ['language', 'art']
#########################################
# Creating Class object & calling the xml rpc custom post Function
#########################################
xmlrpc_object = Custom_WP_XMLRPC()
# On Post submission this function will print the post id
xmlrpc_object.post_article(wpUrl, wpUserName, wpPassword, articleTitle, articleCategories, articleContent, articleTags, ariclePhotoUrl)

How explore and download files from web page python3

I have created this GitHub repository with each python version code in case that you guys would like to check the current code.
https://github.com/AndresUrregoAngel/Python3-request/tree/current
I have a script in Python 2 that logs in, explores and downloads a couple of files from a page using urllib2 and other modules. I would like to migrate this script to Python 3 using the requests module. Unfortunately I have not managed to, because I cannot keep the session open to explore the page and then download the files. Please check the scripts and let me know how I could figure it out.
PythonV2
import os
import time
import urllib
import urllib2
import cookielib
import datetime

# Queue names, grouped by account code.
QUEUES = {'dis0003': ['dis0003-xxxxxxx', 'dis0003-yyyyyy',
                      'dis0003-zzzzzzzz'],
          'dis0006': ['dis0006-xxxxxxx', 'dis0006-yyyyyyyy',
                      'dis0006-zzzzzzzz',
                      'dis0006-mmmmmmm',
                      'dis0006-nnnnnnnnnn']}

# Admin credentials and server root.
LOGIN = "xxx"
PASSWORD = "xxxxx"
ROOT = "https://xxxxx"

# The client has to take care of the cookies (the login session id).
jar = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(jar))

# POST login query on '/login_handler' (post data: 'login' and 'password').
req = urllib2.Request(ROOT + "/login_handler",
                      urllib.urlencode({'login': LOGIN,
                                        'password': PASSWORD}))
opener.open(req)

# Switch the session to each account code in turn.
for accountcode, queues in QUEUES.items():
    req = urllib2.Request(ROOT + "/switch_to/" + accountcode)
    opener.open(req)

    # Today's date range, converted to UNIX timestamps for the stats URL.
    NOW = datetime.datetime.now()
    YEAR = NOW.year
    FROM_MONTH = NOW.month
    TO_MONTH = NOW.month
    FROM_DAY = NOW.day
    TO_DAY = NOW.day
    from_ts = time.mktime(datetime.datetime(
        YEAR, FROM_MONTH, FROM_DAY).timetuple())
    to_ts = time.mktime(datetime.datetime(
        YEAR, TO_MONTH, TO_DAY).timetuple())

    # Fetch the CSV and write each blank-line-separated section to its
    # own file.
    for queue in queues:
        url = "%s/queue/csv/stats/%s/%s/%s" % (
            ROOT, queue, int(from_ts), int(to_ts))
        sections = []
        section = []
        for line in opener.open(urllib2.Request(url)).read().split('\n'):
            if line:
                section.append(line)
            else:
                sections.append(section)
                section = []
        if section:
            sections.append(section)
        for i, section in enumerate(sections):
            # BUG FIX: the original opened the file without ever closing
            # it; ``with`` guarantees the handle is flushed and closed.
            with open(os.path.join("file", "%s-%d.csv" % (queue, i + 1)),
                      "wb") as out:
                out.write('\n'.join(section))

How to remove completed torrent using libtorrent rasterbar python binding?

I have a python script that downloads files using libtorrent python binding. I just want to know how to remove the torrent once the download is complete.
I'm posting here the example script I used to make mine (I'm not posting mine because it's too large, it has database parts).
import libtorrent as lt
import time
ses = lt.session()
params = { 'save_path': '/home/downloads/'}
link = "magnet:?xt=urn:btih:4MR6HU7SIHXAXQQFXFJTNLTYSREDR5EI&tr=http://tracker.vodo.net:6970/announce"
handle = lt.add_magnet_uri(ses, link, params)
print 'downloading metadata...'
while (not handle.has_metadata()): time.sleep(1)
print 'got metadata, starting torrent download...'
while (handle.status().state != lt.torrent_status.seeding):
print '%d %% done' % (handle.status().progress*100)
time.sleep(1)
Thanks.
you call remove_torrent() on the session object, passing in the torrent_handle to remove.
http://libtorrent.org/reference-Core.html#remove_torrent()
In your script:
ses.remove_torrent(handle)

Slow downloading with Python Script & TOR (Source Code included)

I am trying to download HTML pages with my Python script and a TOR proxy server. It runs, but it is extremely slow, and the code is not well organized, so my IP ends up being renewed most of the time instead of pages being downloaded. How can I speed up the downloading with TOR, and how can I organize the code more efficiently?
There are two scripts. Script1 is executed to download HTML pages from the website; after being blocked by the website, Script2 is executed to renew the IP with the help of the TOR proxy, and so on. The IP gets blocked after a few seconds.
Should I lower my thread count? How? Please help me speed up the process — I am only getting 300-500 HTML pages per hour.
Here is my Full Code of Script1:
# -*- coding: UTF-8 -*-
import os
import sys
import socks
import socket
import subprocess
import time
socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS4, '127.0.0.1', 9050, True)
socket.socket = socks.socksocket
import urllib2
class WebPage:
def __init__(self, path, country, url, lower=0,upper=9999):
self.dir = str(path)+"/"+ str(country)
self.dir =os.path.join(str(path),str(country))
self.url = url
try:
fin = open(self.dir+"/limit.txt",'r')
limit = fin.readline()
limits = str(limit).split(",")
lower = int(limits[0])
upper = int(limits[1])
fin.close()
except:
fout = open(self.dir+"/limit.txt",'wb')
limits = str(lower)+","+str(upper)
fout.write(limits)
fout.close()
self.process_instances(lower,upper)
def process_instances(self,lower,upper):
try:
os.stat(self.dir)
except:
os.mkdir(self.dir)
for count in range(lower,upper+1):
if count == upper:
print "all downloaded, quitting the app!!"
break
targetURL = self.url+"/"+str(count)
print "Downloading :" + targetURL
req = urllib2.Request(targetURL)
try:
response = urllib2.urlopen(req)
the_page = response.read()
if the_page.find("Your IP suspended")>=0:
print "The IP is suspended"
fout = open(self.dir+"/limit.txt",'wb')
limits = str(count)+","+str(upper)
fout.write(limits)
fout.close()
break
if the_page.find("Too many requests")>=0:
print "Too many requests"
print "Renew IP...."
fout = open(self.dir+"/limit.txt",'wb')
limits = str(count)+","+str(upper)
fout.write(limits)
fout.close()
subprocess.Popen("C:\Users\John\Desktop\Data-Mine\yp\lol\lol2.py", shell=True)
time.sleep(2)
subprocess.call('lol1.py')
if the_page.find("404 error")>=0:
print "the page not exist"
continue
self.saveHTML(count, the_page)
except:
print "The URL cannot be fetched"
execfile('lol1.py')
pass
#continue
raise
def saveHTML(self,count, content):
fout = open(self.dir+"/"+str(count)+".html",'wb')
fout.write(content)
fout.close()
if __name__ == '__main__':
if len(sys.argv) !=6:
print "cannot process!!! Five Parameters are required to run the process."
print "Parameter 1 should be the path where to save the data, eg, /Users/john/data/"
print "Parameter 2 should be the name of the country for which data is collected, eg, japan"
print "Parameter 3 should be the URL from which the data to collect, eg, the website link"
print "Parameter 4 should be the lower limit of the company id, eg, 11 "
print "Parameter 5 should be the upper limit of the company id, eg, 1000 "
print "The output will be saved as the HTML file for each company in the target folder's country"
exit()
else:
path = str(sys.argv[1])
country = str(sys.argv[2])
url = str(sys.argv[3])
lowerlimit = int(sys.argv[4])
upperlimit = int(sys.argv[5])
WebPage(path, country, url, lowerlimit,upperlimit)
TOR is very slow, so it is to be expected that you don't get that much pages per hour. There are however some ways to speed it up. Most notably you could turn on GZIP compression for urllib (see this question for example) to improve the speed a little bit.
TOR as a protocol has rather low bandwidth, because the data needs to be relayed a few times and each relay must use its bandwidth for your request. If data is relayed 6 times - a rather probable number - you would need 6 times the bandwidth. GZIP compression can compress HTML to (in some cases) ~10% of the original size so that will probably speed up the process.

Downloading Links with Python

I have two sets of scripts: one to download a webpage and another to extract the links from that webpage. They both run, but the links script doesn't return any links. Can anyone see why, or tell me what is wrong?
webpage script;
import sys, urllib
def getWebpage(url):
print '[*] getWebpage()'
url_file = urllib.urlopen(url)
page = url_file.read()
return page
def main():
sys.argv.append('http://www.bbc.co.uk')
if len(sys.argv) != 2:
print '[-] Usage: webpage_get URL'
return
else:
print getWebpage(sys.argv[1])
if __name__ == '__main__':
main()
Links Script
import sys, urllib, re
import getWebpage
def print_links(page):
print '[*] print_links()'
links = re.findall(r'\<a.*href\=.*http\:.+', page)
links.sort()
print '[+]', str(len(links)), 'HyperLinks Found:'
for link in links:
print link
def main():
sys.argv.append('http://www.bbc.co.uk')
if len(sys.argv) != 2:
print '[-] Usage: webpage_links URL'
return
page = webpage_get.getWebpage(sys.argv[1])
print_links(page)
This will fix most of your problems:
import sys, urllib, re
def getWebpage(url):
print '[*] getWebpage()'
url_file = urllib.urlopen(url)
page = url_file.read()
return page
def print_links(page):
print '[*] print_links()'
links = re.findall(r'\<a.*href\=.*http\:.+', page)
links.sort()
print '[+]', str(len(links)), 'HyperLinks Found:'
for link in links:
print link
def main():
    """Download the demo site and print all links found on it."""
    page = getWebpage('http://www.bbc.co.uk')
    print_links(page)

if __name__ == '__main__':
    main()
Then you can move on to fixing your regular expression.
While we are on the topic, though, I have two material recommendations:
use python library requests for getting web pages
use a real XML/HTML library for parsing HTML (recommend lxml)
Your regular expression doesn't have an end, so when it finds the first match it displays the entire rest of the line: `http\:.+` means "everything from `http:` to the end of the line". You need to add something to the pattern that marks where the match should stop.

Categories