python requests-html Chromium process leaking

My program cannot run through the entire loop because a leak crashes it before it gets to the end.
I have the following script:
from requests_html import HTMLSession
from bs4 import BeautifulSoup
import requests

for x in range(9376, 23534):
    session = HTMLSession()
    r = session.get('https://someexampleurl.com/yadayada/database1/{}'.format(x))
    r.html.render()  # this call executes the js in the page
    soup = BeautifulSoup(r.html.html, features="lxml")
    r.close()
    print(x)

    name = "\n".join([img['alt'] for img in soup.find_all('img', alt=True)])
    name = name[1:]
    name = name[:-1]
    url = "\n".join([img['src'] for img in soup.find_all('img', alt=True)])

    def solve_fast(s):
        ind1 = s.find('\n')
        ind2 = s.rfind('\n')
        return s[ind1+1:ind2]

    url = solve_fast(url)
    url = url[0:41] + "1" + url[41+1:]
    url = url[0:42] + "2" + url[42+1:]
    url = url[0:43] + "8" + url[43+1:]

    img_data = requests.get(url)
    with open('local_database1/{}{}.avif'.format(x, name), 'wb') as handler:
        handler.write(img_data.content)
    img_data.close()
When run in a loop, the Chromium processes stack up until the program crashes. I can't see where I am failing to close the connection for the request.

In my case, session.close() works.
Code
from requests_html import HTMLSession
session = HTMLSession()
r = session.get('https://xxxxxxxx')
r.html.render()
...
session.close()
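Applied to the loop in the question, the same idea means closing the HTMLSession created in each iteration, so its Chromium instance is shut down before the next one starts. A minimal sketch under that assumption (the URL and range are the placeholders from the question):
from requests_html import HTMLSession
from bs4 import BeautifulSoup

for x in range(9376, 23534):
    session = HTMLSession()
    try:
        r = session.get('https://someexampleurl.com/yadayada/database1/{}'.format(x))
        r.html.render()  # render() is what spawns the Chromium process
        soup = BeautifulSoup(r.html.html, features="lxml")
        # ... process soup exactly as in the question ...
    finally:
        session.close()  # tears down this session's Chromium instance
Alternatively, creating a single HTMLSession outside the loop and reusing it for every page keeps it to one Chromium instance; either way the key point is that close() gets called.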

Related

How to crawl images inside links of a page

I need the crawler to go to the links inside a website and scan images there. I've managed to get this far but I'm confused.
I'm trying to do something like this, but I'm sure there's an easier way.
from bs4 import *
import requests as rq
import os
import sys
from urllib.parse import urlparse

page_url = sys.argv[1]
depth = int(sys.argv[2])

crawl = str(page_url)
r2 = rq.get('https://www.' + crawl + '' + '/')
soup2 = BeautifulSoup(r2.text, "html.parser")

links = []
images = []
link_urls = soup2.select('a')

def url_validator(link):
    try:
        result = urlparse(link)
        return all([result.scheme, result.netloc])
    except:
        return False

def crawl_images(link):
    requested_link = rq.get(link)
    images = BeautifulSoup(requested_link.text, "html.parser")
    image = images.select('img')
    for img in image:
        print(img['src'])
        return img['src']

for link_url in link_urls[:depth]:
    links.append(link_url['href'])

for link in links:
    # print(link)
    if url_validator(link):
        crawl_images(link)
I run python3 new_crawler.py imdb.com 3, which should print the sources of the images found in the first 3 links inside imdb.com, but it isn't printing anything.
You want to crawl through the images, correct? Try this:
from bs4 import BeautifulSoup
import requests as rq

URL = ""
source = rq.get(URL)
soup = BeautifulSoup(source.text, "html.parser")

image_links = soup.find_all("img")
for img in image_links:
    print(img['src'])
Add the URL of the website you are trying to scrape to the URL constant. All of the page's img tags should then be collected in the image_links variable.
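To combine this with the link crawling from the question, one option is to collect the hrefs from the start page first and then run the same img extraction on each linked page. A rough sketch of that idea (start_url and depth are placeholders, and urljoin is used because hrefs may be relative):
from bs4 import BeautifulSoup
import requests as rq
from urllib.parse import urljoin

start_url = "https://www.imdb.com/"   # example start page
depth = 3                             # number of links to follow, as in the question

start_soup = BeautifulSoup(rq.get(start_url).text, "html.parser")
links = [urljoin(start_url, a["href"]) for a in start_soup.select("a[href]")][:depth]

for link in links:
    page = BeautifulSoup(rq.get(link).text, "html.parser")
    for img in page.find_all("img", src=True):
        print(link, "->", img["src"])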
This is what I ended up with. It's not working the way it's supposed to, but the time for the task is up and I decided to share it anyway.
from bs4 import *
import requests as rq
import sys
from urllib.parse import urlparse
import json

page_url = sys.argv[1]
depth = int(sys.argv[2])

crawl = str(page_url)
r2 = rq.get('https://www.' + crawl + '' + '/')
soup2 = BeautifulSoup(r2.text, "html.parser")
link_urls = soup2.select('a')

links = []
images_sources = []

def url_validator(link):
    try:
        result = urlparse(link)
        return all([result.scheme, result.netloc])
    except:
        return False

def crawl_images(link):
    requested_link = rq.get(link)
    images = BeautifulSoup(requested_link.text, "html.parser")
    image = images.select('img')
    for img in image:
        images_sources.append(img['src'])
        results = {
            "imageUrl": img['src'],
            "sourceUrl": link,
            "depth": depth
        }
        json_object = json.dumps(results)
        with open("results.json", "w") as f:
            f.write(json_object)
            f.close()
    return results

for link_url in link_urls[:depth]:
    links.append(link_url['href'])

for link in links:
    if url_validator(link):
        crawl_images(link)
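One issue with the version above (beyond what the poster already flags) is that results.json is reopened in "w" mode for every image, so each write overwrites the previous one. A sketch of the same idea that accumulates all results and writes the file once at the end (field names kept from the snippet above):
import json
import requests as rq
from bs4 import BeautifulSoup

def collect_image_results(links, depth):
    results = []
    for link in links:
        page = BeautifulSoup(rq.get(link).text, "html.parser")
        for img in page.select("img[src]"):
            results.append({
                "imageUrl": img["src"],
                "sourceUrl": link,
                "depth": depth,
            })
    return results

# write everything in one go instead of once per image:
# results = collect_image_results(links, depth)
# with open("results.json", "w") as f:
#     json.dump(results, f, indent=2)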

Python : How to stay logged in while scraping?

Just to clarify from the beginning: I'm a total beginner (I wrote something in Python for the first time today). This was more a matter of following a guide and trying to remember what I did 7 years ago when I tried learning Java than anything else.
I wanted to scrape the image tags from a website (to plot them later), but I have to stay logged in to view all images. After I got the scraping down, I noticed that some tags were blocked, so the login issue came up. I've now managed to log in, but it doesn't work outside of the session itself, which makes the rest of my code useless. Can I get this to work, or do I have to give up?
This is the working login:
import requests
from urllib.request import urlopen
from bs4 import BeautifulSoup as soup

login_data = {
    'user': 'theusername',
    'pass': 'thepassword',
    'op': 'Log in'
}

with requests.Session() as s:
    url = "https://thatwebsite.com/index.php?page=account&s=login&code=00"
    r = s.get(url)
    r = s.post(url, data=login_data)
And what I had working before to scrape the website but with the login missing:
filename = "taglist.txt"
f = open(filename, "w", encoding="utf-8")
headers = "tags\n"
f.write(headers)

pid = 0
actual_page = 1

while pid < 150:
    url = "https://thatwebsite.com/index.php?page=post&s=list&tags=absurdres&pid=" + str(pid)
    print(url)
    client = urlopen(url)
    page_html = client.read()
    client.close()
    page_soup = soup(page_html, "html.parser")
    containers = page_soup.findAll("div", {"class": "thumbnail-preview"})
    print("Current pid: " + str(pid))
    for container in containers:
        tags = container.span.a.img["title"]
        f.write(tags.replace(" ", "\n") + "\n")
    pid = pid + 42
    print("Current page: " + str(actual_page))
    actual_page += 1

print("Done.")
f.close()
Out comes a list of every tag used by high res images.
I hope I don't offend anyone with this.
Edit: The code is working now; I had a cookie typo:
import requests
from bs4 import BeautifulSoup as soup

login_data = {
    'user': 'myusername',
    'pass': 'mypassword',
    'op': 'Log in'
}

s = requests.Session()

print("\n\n\n\n\n")

filename = "taglist.txt"
f = open(filename, "w", encoding="utf-8")
headers = "tags\n"
f.write(headers)

pid = 0
actual_page = 1

while pid < 42:
    url2 = "https://thiswebsite.com/index.php?page=post&s=list&tags=rating:questionable&pid=" + str(pid)
    r = s.get(url2, cookies={'duid': 'somehash', 'user_id': 'my userid', 'pass_hash': 'somehash'})
    page_html = str(r.content)
    page_soup = soup(page_html, "html.parser")
    containers = page_soup.findAll("div", {"class": "thumbnail-preview"})
    for container in containers:
        tags = container.span.a.img["title"]
        f.write(tags.replace(" ", "\n") + "\n")
    print("\nCurrent page: " + str(actual_page) + " Current pid: " + str(pid) + "\nDone.")
    actual_page += 1
    pid = pid + 42

f.close()
You are currently using two different libraries for web requests, requests and urllib. I would opt for using only requests.
Also, don't use Session() as a context manager here. Context managers do some cleanup after leaving the indented block; that's the with ... as x syntax you used on the requests.Session() object. In the context of requests, this closes the session, and with it the cookies, as soon as you leave the block. (I assume login is managed by cookies on this site.)
Instead, keep the session in a variable that you can reuse for subsequent requests: it stores the cookies you receive at login, and you need those cookies for the later requests.
s = requests.Session()
url = "https://thatwebsite.com/index.php?page=account&s=login&code=00"
r = s.get(url) # do you need this request?
r = s.post(url, data=login_data)
Also make the subsequent call in the loop with requests:
client = s.get(url)
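Putting the two pieces together, a rough sketch of what the scraping loop could look like once everything goes through the same session (URLs and credentials are the placeholders from the question):
import requests
from bs4 import BeautifulSoup as soup

login_data = {'user': 'theusername', 'pass': 'thepassword', 'op': 'Log in'}

s = requests.Session()
login_url = "https://thatwebsite.com/index.php?page=account&s=login&code=00"
s.post(login_url, data=login_data)   # the session now holds the login cookies

pid = 0
while pid < 150:
    url = "https://thatwebsite.com/index.php?page=post&s=list&tags=absurdres&pid=" + str(pid)
    r = s.get(url)                   # reuse the logged-in session instead of urlopen
    page_soup = soup(r.text, "html.parser")
    for container in page_soup.find_all("div", {"class": "thumbnail-preview"}):
        print(container.span.a.img["title"])
    pid += 42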

Extract .jpg from HTML source code with Python

I set up this code to extract the image links from the following website. The problem is that it stops at record 19 and doesn't continue with the listing.
Can you help me?
import urllib.request
import os
import codecs
from bs4 import BeautifulSoup

tematica = 'fun'
url = "https://www.shutterstock.com/es/search/" + tematica

request = urllib.request.Request(url)
response = urllib.request.urlopen(request)
data_content = response.read()

Html_file = open("html_file.html", "wb")
Html_file.write(data_content)
Html_file.close()

html = codecs.open("html_file.html", 'r', 'utf-8').read()
soup = BeautifulSoup(html)

for i, img_element in enumerate(soup.findAll('img', None)):
    try:
        img_src = img_element['src']
        print(i, img_src)
    except:
        pass
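No answer is recorded here, but one thing worth checking is that many image listings only put real URLs in src for the first few entries and lazy-load the rest through attributes such as data-src, which could explain a listing that stops after record 19. A sketch that looks at both attributes and keeps only .jpg links (the data-src fallback is an assumption, not something confirmed for this site):
import requests
from bs4 import BeautifulSoup

tematica = 'fun'
url = "https://www.shutterstock.com/es/search/" + tematica

html = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}).text
soup = BeautifulSoup(html, "html.parser")

jpg_links = []
for img in soup.find_all("img"):
    # fall back to data-src, a common lazy-loading attribute (assumption)
    src = img.get("src") or img.get("data-src")
    if src and ".jpg" in src:
        jpg_links.append(src)

for i, link in enumerate(jpg_links):
    print(i, link)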

Crawling over a website directories using BeautifulSoup?

This is my code:
https://pastebin.com/R11qiTF4
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as req
from urllib.parse import urljoin
import re

urls = ["https://www.helios-gesundheit.de"]
domain_list = ["https://www.helios-gesundheit.de/kliniken/schwerin/"]
prohibited = ["info", "news"]
text_keywords = ["Helios", "Helios"]
url_list = []
desired = "https://www.helios-gesundheit.de/kliniken/schwerin/unser-angebot/unsere-fachbereiche-klinikum/allgemein-und-viszeralchirurgie/team-allgemein-und-viszeralchirurgie/"

for x in range(len(domain_list)):
    url_list.append(urls[x] + domain_list[x].replace(urls[x], ""))
print(url_list)

def prohibitedChecker(prohibited_list, string):
    for x in prohibited_list:
        if x in string:
            return True
        else:
            return False
        break

def parseHTML(url):
    requestHTML = req(url)
    htmlPage = requestHTML.read()
    requestHTML.close()
    parsedHTML = soup(htmlPage, "html.parser")
    return parsedHTML

searched_word = "Helios"

for url in url_list:
    parsedHTML = parseHTML(url)
    href_crawler = parsedHTML.find_all("a", href=True)
    for href in href_crawler:
        crawled_url = urljoin(url, href.get("href"))
        print(crawled_url)
        if "www" not in crawled_url:
            continue
        parsedHTML = parseHTML(crawled_url)
        results = parsedHTML.body.find_all(string=re.compile('.*{0}.*'.format(searched_word)), recursive=True)
        for single_result in results:
            keyword_text_check = prohibitedChecker(text_keywords, single_result.string)
            if keyword_text_check != True:
                continue
            print(single_result.string)
I'm trying to print the contents of the "desired" variable. The problem is the following: my code never even requests the URL in "desired", because it isn't within the scope the crawler reaches. The "desired" href is inside another href that is inside the page I'm currently scraping. I thought I'd fix this by adding another for loop inside the for loop at line 39, requesting every href found in the first one, but that gets messy and inefficient.
Is there a way to get a list of every directory of a website URL?
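No answer is recorded here, but the usual way to reach links that are nested several pages deep is a small breadth-first crawl with a depth limit and a visited set, rather than nesting more for loops. A rough sketch along those lines (the depth limit and same-domain filter are assumptions, not part of the original code):
from collections import deque
from urllib.parse import urljoin, urlparse
from urllib.request import urlopen as req
from bs4 import BeautifulSoup as soup

def crawl(start_url, max_depth=2):
    domain = urlparse(start_url).netloc
    seen = {start_url}
    queue = deque([(start_url, 0)])
    while queue:
        url, depth = queue.popleft()
        parsed = soup(req(url).read(), "html.parser")
        yield url, parsed
        if depth >= max_depth:
            continue
        for a in parsed.find_all("a", href=True):
            link = urljoin(url, a["href"])
            # stay on the same site and avoid revisiting pages
            if urlparse(link).netloc == domain and link not in seen:
                seen.add(link)
                queue.append((link, depth + 1))

for url, page in crawl("https://www.helios-gesundheit.de/kliniken/schwerin/", max_depth=2):
    print(url)
Every page yielded by crawl() can then be searched for the keyword exactly as in the existing loop.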

How to put the image files I scraped using Beautiful soup into a list?

This is the code I used to take all the pics from r/pics on Reddit and put them into a directory. I want to be able to take the actual files in the directory and put them into a list. I'm stuck on how to do this.
import requests
from bs4 import BeautifulSoup as bs
import os

url = "https://www.reddit.com/r/pics/"
r = requests.get(url)
data = r.text
soup = bs(data, 'lxml')
image_tags = soup.findAll('img')

if not os.path.exists('direct'):
    os.makedirs('direct')
os.chdir('direct')

x = 0
for image in image_tags:
    try:
        url = image['src']
        source = requests.get(url)
        if source.status_code == 200:
            img_path = 'direct-' + str(x) + '.jpg'
            with open(img_path, 'wb') as f:
                f.write(requests.get(url).content)
                f.close()
            x += 1
    except:
        pass
Edit: Here is the updated code, but I'm still dealing with the problem:
import requests
from bs4 import BeautifulSoup as bs
import os

url = "https://www.reddit.com/r/drawing"
r = requests.get(url)
data = r.text
soup = bs(data, 'lxml')
image_tags = soup.findAll('img')

if not os.path.exists('directory'):
    os.makedirs('directory')
os.chdir('directory')

x = 0
mylist = []
for image in image_tags:
    url = image['src']
    source = requests.get(url)
    if source.status_code == 200:
        img_path = 'direct-' + str(x) + '.jpg'
        with open(img_path, 'wb') as f:
            f.write(requests.get(url).content)
            mylist.append(img_path)
            f.close()
        x += 1

print(mylist)
Create a list at the beginning of your code:
...
mylist = []
...
Then, after you save each image, add its path to the list:
...
img_path = 'direct-' + str(x) + '.jpg'
mylist.append(img_path)
...
EDIT:
I executed your updated code, and image_tags comes back empty. In fact, the page returned by
url = "https://www.reddit.com/r/drawing"
r = requests.get(url)
data = r.text
doesn't contain any images. I guess Reddit has some kind of protection to prevent you from fetching images this way.
Try adding print(data) and you will see what I mean.
You should use the reddit api so that reddit doesn't limit your requests.
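As a rough illustration of that suggestion, Reddit exposes a JSON listing for each subreddit (the same URL with .json appended), which is usually easier to work with than the rendered HTML. A sketch under that assumption; the fields used below (children, url) are what such listings commonly contain, but treat the exact schema as an assumption rather than a guarantee:
import requests

headers = {"User-Agent": "simple-image-scraper/0.1"}   # Reddit expects a descriptive User-Agent
resp = requests.get("https://www.reddit.com/r/drawing.json", headers=headers)
listing = resp.json()

image_urls = []
for post in listing["data"]["children"]:
    post_url = post["data"].get("url", "")
    # keep only direct image links (assumption: image posts link straight to the file)
    if post_url.endswith((".jpg", ".jpeg", ".png", ".gif")):
        image_urls.append(post_url)

print(image_urls)
The collected URLs can then be downloaded and written to files exactly as in the loop above.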
