Code:
import requests as rq
from bs4 import BeautifulSoup as bs

url = "https://apod.nasa.gov/apod/astropix.html"
page = rq.get(url).content
soup = bs(page, 'html.parser')
response = soup.find('img')
if response is None:
    imglink = soup.find('iframe')['src']
else:
    imglink = 'https://apod.nasa.gov/apod/' + response['src']

def main():
    sess = rq.Session()
    cid = '**************'
    turl = 'https://api.telegram.org/bot*******************/'
    if response is None:
        # No <img> on the page (video day): send the iframe link as plain text
        imglink = soup.find('iframe')['src']
        params = {'chat_id': cid, 'text': imglink}
        sess.post(turl + 'sendMessage', data=params)
    else:
        # Image day: send the picture with the page title as caption
        imglink = 'https://apod.nasa.gov/apod/' + response['src']
        title = soup.find('b').get_text()
        params = {'chat_id': cid, 'photo': imglink, 'caption': title}
        sess.post(turl + 'sendPhoto', data=params)

if __name__ == '__main__':
    main()
This is a simple bot that sends the NASA Astronomy Picture of the Day to my Telegram channel. I will be modifying this script so that it runs every day. But the question is: where do I host it so that it runs all the time, for free? What is the correct way of doing this?
I don't know of any providers that would host this for free. For cheap, AWS and Google Cloud both have simple solutions.
Ex: https://cloud.google.com/blog/products/application-development/how-to-schedule-a-recurring-python-script-on-gcp
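If the script ends up on an always-on machine (any of the cheap options above), one simple way to run it once a day is a plain in-process loop. This is only a minimal sketch, assuming main() is the function from the code above; a cron entry or a managed scheduler like the one in the linked article is usually the better choice:
import time

def run_daily(job, interval_seconds=24 * 60 * 60):
    # Call job() once per interval, forever; keep the loop alive if one run fails.
    while True:
        try:
            job()
        except Exception as exc:
            print("run failed:", exc)
        time.sleep(interval_seconds)

if __name__ == '__main__':
    run_daily(main)  # main() as defined in the bot script above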
Related
My program cannot run through the entire loop because a leak crashes it before it gets to the end.
I have the following script:
from requests_html import HTMLSession
from bs4 import BeautifulSoup
import requests

for x in range(9376, 23534):
    session = HTMLSession()
    r = session.get('https://someexampleurl.com/yadayada/database1/{}'.format(x))
    r.html.render()  # this call executes the js in the page
    soup = BeautifulSoup(r.html.html, features="lxml")
    r.close()
    print(x)

    name = "\n".join([img['alt'] for img in soup.find_all('img', alt=True)])
    name = name[1:]
    name = name[:-1]

    url = "\n".join([img['src'] for img in soup.find_all('img', alt=True)])

    def solve_fast(s):
        ind1 = s.find('\n')
        ind2 = s.rfind('\n')
        return s[ind1+1:ind2]

    url = solve_fast(url)
    url = url[0:41] + "1" + url[41+1:]
    url = url[0:42] + "2" + url[42+1:]
    url = url[0:43] + "8" + url[43+1:]

    img_data = requests.get(url)
    with open('local_database1/{}{}.avif'.format(x, name), 'wb') as handler:
        handler.write(img_data.content)
    img_data.close()
When run in a loop, the Chromium processes stack up indefinitely until the program crashes. I can't see where I am failing to close the connection for the request.
In my case, session.close() works.
Code
from requests_html import HTMLSession
session = HTMLSession()
r = session.get('https://xxxxxxxx')
r.html.render()
...
session.close()
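Applied to the loop from the question, that might look like the sketch below (one HTMLSession per iteration, closed in a finally block so its Chromium process is shut down before the next iteration; creating a single session outside the loop would also work):
from requests_html import HTMLSession
from bs4 import BeautifulSoup

for x in range(9376, 23534):
    session = HTMLSession()
    try:
        r = session.get('https://someexampleurl.com/yadayada/database1/{}'.format(x))
        r.html.render()  # runs the page's JS in a headless Chromium
        soup = BeautifulSoup(r.html.html, features="lxml")
        r.close()
    finally:
        session.close()  # shuts down the Chromium instance for this session
    # ... process soup and download the image as in the question ...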
Can anyone please explain how I can split the results so that I get just a simple URL and its response? I have tried many times with no luck; for now I can only print something like:
50
0.4110674999999999
........, [<Response [200]>], [<Response [200]>], [<Response [200]>]]
[......, ['http://example.com.com/catalogue/page-48.html'], ['http://example.com.com/catalogue/page-49.html'], ['http://example.com.com/catalogue/page-50.html']]
I need something like:
<Response [200]>
https://example.com/
Thanks so much.
P.S. Also, why do I get this message on the console after installing the grequests module?
C:\P3\lib\site-packages\grequests.py:22: MonkeyPatchWarning: Monkey-patching ssl after ssl has already been imported may lead to errors, including RecursionError on Python 3.6. It may also silently lead to incorrect behaviour on Python 3.7. Please monkey-patch earlier. See https://github.com/gevent/gevent/issues/1016. Modules that had direct imports (NOT patched): ['urllib3.util.ssl_ (C:\\P3\\lib\\site-packages\\urllib3\\util\\ssl_.py)', 'urllib3.util (C:\\P3\\lib\\site-packages\\urllib3\\util\\__init__.py)'].
curious_george.patch_all(thread=False, select=False)
How can I fix it? Do I need to completely reinstall Python, install some patch, or what?
Thanks!
import grequests
from bs4 import BeautifulSoup
import time

def get_urls():
    urls = []
    for x in range(1, 51):
        urls.append(f'http://books.toscrape.com/catalogue/page-{x}.html')
    return urls

def get_data(urls):
    reqs = [grequests.get(link) for link in urls]
    resp = grequests.map(reqs)
    return resp

if __name__ == '__main__':
    start = time.perf_counter()
    urls = get_urls()
    url = len(get_urls())
    resp = get_data(urls)
    respo = len(get_data(urls))
    fin = time.perf_counter() - start

    resp_list = resp
    chunked_resp = list()
    chunk_size = respo
    urls_list = urls
    chunked_url = list()
    chunk_size = url

    print(urls)
    print(url)
    print(resp)
    print(respo)
    print(fin)

    resp_list = resp
    chunked_resp = list()
    chunk_size = 1
    for i in range(0, len(resp_list), chunk_size):
        chunked_resp.append(resp_list[i:i+chunk_size])
    print(chunked_resp)

    urls_list = urls
    chunked_url = list()
    chunk_size = 1
    for i in range(0, len(urls_list), chunk_size):
        chunked_url.append(urls_list[i:i+chunk_size])
    print(chunked_url)
OK, I have found a solution, but only for printing the URLs:
def get_data(urls):
    reqs = [grequests.get(link) for link in urls]
    resp = grequests.get(reqs)
    return resp

if __name__ == '__main__':
    start = time.perf_counter()
    urls = get_urls()
    resp = get_data(urls)
    resp = '\n'.join(resp)
    url = '\n'.join(urls)
http://books.toscrape.com/catalogue/page-48.html
http://books.toscrape.com/catalogue/page-49.html
http://books.toscrape.com/catalogue/page-50.html
resp = '\n'.join(resp)
TypeError: can only join an iterable
But for resp I get TypeError: can only join an iterable.
P.S. I have only been learning Python for about a month... :(
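For reference, a minimal sketch (not from the original thread) of pairing each URL with its response, assuming grequests.map() returns the responses in the same order as the requests:
import grequests  # importing grequests before requests/urllib3 typically avoids the MonkeyPatchWarning shown above

urls = [f'http://books.toscrape.com/catalogue/page-{x}.html' for x in range(1, 51)]
reqs = [grequests.get(link) for link in urls]
responses = grequests.map(reqs)  # list of Response objects (None for failed requests)

for link, resp in zip(urls, responses):
    print(resp)  # e.g. <Response [200]>
    print(link)  # e.g. http://books.toscrape.com/catalogue/page-48.html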
I am new to web scraping and building APIs, and I am facing an error while scraping an e-commerce website. Below is my Python code; please guide me through it. I am getting "The requested URL was not found on the server." when running on localhost.
from flask import Flask , request , jsonify
from bs4 import BeautifulSoup
import requests
app = Flask(__name__)
#app.route('/',methods=['GET'])
def API():
if request.method == 'GET':
uri = 'https://www.flipkart.com'
query = str(request.args['query'])
print(query)
if " " in query:
query = str(query).replace(" ","+")
else:
pass
search = '/search?q=' + query
ready_uri = uri + search
print(ready_uri)
content = requests.get(ready_uri).content
soup = BeautifulSoup(content, 'html.parser')
quotes_links = soup.find_all('a', {'class': '_3O0U0u'})
l = []
for i in quotes_links:
d = {}
quote_url = uri + i.get('href')
quote_content = requests.get(quote_url).content
quote_soup = BeautifulSoup(quote_content, 'html.parser')
d['quote'] = quote_soup.find('p', {'class': '_3wU53n'}).text
d['author'] = quote_soup.find('p', {'class': '_1vC4OE _2rQ-NK'}).text
l.append(d)
return jsonify(l)
if __name__ == '__main__':
app.run()
Error:
"GET /search?q=books HTTP/1.1" 404 -
How do you get a query string on Flask?
You appear to be getting the query argument incorrectly.
query = str(request.args['query'])
When it should be:
query = str(request.args.get('query'))
Doing so returns a 200, but with blank data. I would suggest looking at the element you're scraping:
quotes_links = soup.find_all('a', {'class': '_3O0U0u'})
Once you obtain the correct element with soup, you should start seeing return data.
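For reference, a minimal sketch of reading a query-string parameter with request.args.get(); the route and default value here are illustrative, not taken from the original question:
from flask import Flask, request, jsonify

app = Flask(__name__)

@app.route('/', methods=['GET'])
def api():
    # .get() returns None (or the supplied default) instead of raising
    # a 400 error when the parameter is missing from the query string.
    query = request.args.get('query', default='')
    return jsonify({'query': query})

if __name__ == '__main__':
    app.run()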
Just to clarify from the beginning: I'm a total beginner (I wrote something in Python for the first time today). This was more about applying things from a guide and trying to remember what I did 7 years ago when I tried learning Java than anything else.
I wanted to scrape the image tags from a website (to plot them later), but I have to stay logged in to view all images. After I got the scraping down, I noticed that some tags were blocked, so the issue with the login came up. I have now managed to log in, but it doesn't work outside of the session itself, which makes the rest of my code useless. Can I get this to work, or do I have to give up?
This is the working login:
import requests
from urllib.request import urlopen
from bs4 import BeautifulSoup as soup

login_data = {
    'user': 'theusername',
    'pass': 'thepassword',
    'op': 'Log in'
}

with requests.Session() as s:
    url = "https://thatwebsite.com/index.php?page=account&s=login&code=00"
    r = s.get(url)
    r = s.post(url, data=login_data)
And here is what I had working before to scrape the website, but with the login missing:
filename = "taglist.txt"
f = open(filename, "w", encoding="utf-8")
headers = "tags\n"
f.write(headers)

pid = 0
actual_page = 1

while pid < 150:
    url = "https://thatwebsite.com/index.php?page=post&s=list&tags=absurdres&pid=" + str(pid)
    print(url)
    client = urlopen(url)
    page_html = client.read()
    client.close()
    page_soup = soup(page_html, "html.parser")
    containers = page_soup.findAll("div", {"class": "thumbnail-preview"})
    print("Current pid: " + str(pid))
    for container in containers:
        tags = container.span.a.img["title"]
        f.write(tags.replace(" ", "\n") + "\n")
    pid = pid + 42
    print("Current page: " + str(actual_page))
    actual_page += 1

print("Done.")
f.close()
Out comes a list of every tag used by high res images.
I hope I don't offend anyone with this.
Edit: The code is working now, had a cookie typo:
import requests
from bs4 import BeautifulSoup as soup

login_data = {
    'user' : 'myusername',
    'pass' : 'mypassword',
    'op' : 'Log in'
}

s = requests.Session()

print("\n\n\n\n\n")

filename = "taglist.txt"
f = open(filename, "w", encoding="utf-8")
headers = "tags\n"
f.write(headers)

pid = 0
actual_page = 1

while pid < 42:
    url2 = "https://thiswebsite.com/index.php?page=post&s=list&tags=rating:questionable&pid=" + str(pid)
    r = s.get(url2, cookies={'duid' : 'somehash', 'user_id' : 'my userid', 'pass_hash' : 'somehash'})
    page_html = str(r.content)
    page_soup = soup(page_html, "html.parser")
    containers = page_soup.findAll("div", {"class": "thumbnail-preview"})
    for container in containers:
        tags = container.span.a.img["title"]
        f.write(tags.replace(" ", "\n") + "\n")
    print("\nCurrent page: " + str(actual_page) + " Current pid: " + str(pid) + "\nDone.")
    actual_page += 1
    pid = pid + 42

f.close()
You are currently using two different libraries for making web requests: requests and urllib. I would opt for using only requests.
Also, don't use Session() as a context manager here. Context managers are used to do some cleanup after leaving the indented block; that is the with ... as x syntax you use on the requests.Session() object. In the context of requests, this will clear the cookies as you leave the session. (I assume login is managed by cookies on this site.)
Instead, keep the session in a variable that you can use for subsequent requests, as it stores the cookies you receive at login; you need them for those later requests.
s = requests.Session()
url = "https://thatwebsite.com/index.php?page=account&s=login&code=00"
r = s.get(url) # do you need this request?
r = s.post(url, data=login_data)
Also make the subsequent calls in the loop with requests, via the same session:
client = s.get(url)
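Putting it together, a rough sketch of the login plus the scraping loop on one shared session (URL, selectors, and pid step taken from the question; untested, and it assumes the site keeps you logged in via session cookies):
import requests
from bs4 import BeautifulSoup as soup

login_data = {'user': 'theusername', 'pass': 'thepassword', 'op': 'Log in'}

s = requests.Session()
login_url = "https://thatwebsite.com/index.php?page=account&s=login&code=00"
s.post(login_url, data=login_data)  # the login cookies are stored on the session

with open("taglist.txt", "w", encoding="utf-8") as f:
    f.write("tags\n")
    for pid in range(0, 150, 42):
        url = "https://thatwebsite.com/index.php?page=post&s=list&tags=absurdres&pid=" + str(pid)
        r = s.get(url)  # reuse the logged-in session instead of urlopen
        page_soup = soup(r.content, "html.parser")
        for container in page_soup.find_all("div", {"class": "thumbnail-preview"}):
            tags = container.span.a.img["title"]
            f.write(tags.replace(" ", "\n") + "\n")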
I am attempting to use the Google UrlShortener API to retrieve history with OAuth2 and an API key. I am getting a 200 OK response, but when I try to get subsequent pages using pagetoken or pageToken as a query parameter, I always get the same nextPageToken and the same page of results. Oddly, the browser-based Google API interaction uses start-token, not pagetoken or pageToken, but when I use start-token I don't get a 200 OK.
How do I get pagination to work with the UrlShortener API?
Here is my code:
import requests
import json
import time
import settings
from oauth2client.client import OAuth2WebServerFlow
from oauth2client.tools import run_flow
from oauth2client.file import Storage

def history():
    """Look up a user's history"""
    flow = OAuth2WebServerFlow(client_id=settings.OAUTH2_CLIENT_ID,
                               client_secret=settings.CLIENT_SECRET,
                               scope='https://www.googleapis.com/auth/urlshortener',
                               redirect_uri='http://127.0.0.1:5000/callback')
    storage = Storage('creds.data')
    credentials = run_flow(flow, storage)
    print("access_token: {}".format(credentials.access_token))
    headers = {'Content-Type': 'application/json', 'Authorization': 'Bearer {}'.format(credentials.access_token)}
    raw_url = 'https://www.googleapis.com/urlshortener/v1/url/history'
    url = raw_url + '?key={}'.format(settings.API_KEY)
    r = requests.get(url=url, headers=headers)
    if r.ok:
        output = "The history is {}.".format(r.json())
        print(output)
        if 'nextPageToken' in r.json().keys():
            morePages = True
            npt = r.json()['nextPageToken']
            r_paged = None
            while morePages:
                time.sleep(2)
                url = raw_url + '?pagetoken={}&key={}'.format(npt, settings.API_KEY)
                r_paged = requests.get(url=url, headers=headers)
                if r_paged.ok:
                    if 'nextPageToken' in r_paged.json().keys():
                        npt = r_paged.json()['nextPageToken']
                        morePages = True
                    else:
                        morePages = False
                        break
                    output = "The history is {}.".format(r_paged.json())
                    print(output)
                else:
                    output = "Invalid request. Status code = {}, json = {}".format(r_paged.status_code, r_paged.json())
                    print(output)
    else:
        output = "Invalid request. Status code = {}, json = {}".format(r.status_code, r.json())
        print(output)
Fixed code follows:
# New import:
import urllib.parse
# // snip
time.sleep(2)
f = {'start-token':npt, 'key': settings.API_KEY}
formatted = '?' + urllib.parse.urlencode(f)
url = raw_url + formatted
r_paged = requests.get(url=url, headers=headers)
# // snip
Basically, ignore the documentation. Do NOT use pageToken; use start-token. Furthermore, you need to use the URL-handling module suitable for Python 3 (urllib.parse) for the URL encoding.
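For reference, the paging part of the loop with start-token might look like the sketch below (parameter handling as in the fix above, variable names from the question; the helper function itself is illustrative, not from the original code):
import time
import urllib.parse
import requests

def fetch_history_pages(raw_url, headers, api_key, first_page):
    # Yield each page of history, following nextPageToken via the start-token parameter.
    page = first_page
    while True:
        yield page
        npt = page.get('nextPageToken')
        if not npt:
            break
        time.sleep(2)
        params = urllib.parse.urlencode({'start-token': npt, 'key': api_key})
        r = requests.get(raw_url + '?' + params, headers=headers)
        r.raise_for_status()
        page = r.json()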