access list in a 2x-nested function - python

import urllib.request
from bs4 import BeautifulSoup

def get_html(url):
    response = urllib.request.urlopen(url)
    return response.read()

def parse_main(html):
    webpage = BeautifulSoup(html, features="html.parser")
    table = webpage.find('table', id='itemList')
    for a_tag in table.find_all('a', class_='all'):
        parse_movie(get_html('https://www.somerandommovieswebsite.com' + a_tag['href']))

def parse_movie(html):
    web_page = BeautifulSoup(html, features="html.parser")
    info = web_page.find('h1', class_="moviename")
    movies.append(info.text)  # 'movies' is not defined in this scope

def main():
    movies = []
    parse_main(get_html('https://www.somerandommovieswebsite.com'))
    print(movies)

if __name__ == '__main__':
    main()
How do I access the movies list (defined in the main() function) in parse_movie, which is called two levels down via parse_main? I can't append anything to the list because of an "unresolved reference 'movies'" error. Using nonlocal didn't help.
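For context, nonlocal only helps when the inner function is lexically nested inside the outer one, which is not the case in the code above; a minimal illustration of the difference (hypothetical outer/inner names):

def outer():
    items = []
    def inner():           # defined inside outer: 'items' is in the enclosing scope
        items.append(1)    # mutating a closed-over list needs no nonlocal at all
    inner()
    return items

# parse_movie is defined at module level, not inside main(), so there is no
# enclosing function scope containing 'movies' for nonlocal to bind to.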

I think you should neither use a global variable here nor pass it as an argument:
import urllib.request
from bs4 import BeautifulSoup

def get_html(url):
    response = urllib.request.urlopen(url)
    return response.read()

def parse_main(html):
    movies = []
    webpage = BeautifulSoup(html, features="html.parser")
    table = webpage.find('table', id='itemList')
    for a_tag in table.find_all('a', class_='all'):
        movies.append(
            parse_movie(get_html('https://www.somerandommovieswebsite.com' + a_tag['href']))
        )
    return movies

def parse_movie(html):
    web_page = BeautifulSoup(html, features="html.parser")
    info = web_page.find('h1', class_="moviename")
    return info.text

def main():
    movies = parse_main(get_html('https://www.somerandommovieswebsite.com'))
    print(movies)

if __name__ == '__main__':
    main()

There are several ways to do it.
First, you can define movies globally; a minimal sketch of that variant follows below.
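(A sketch of the global-variable approach, reusing the question's parse_movie; appending mutates the module-level list, so no global statement is needed.)

import urllib.request
from bs4 import BeautifulSoup

movies = []  # module-level list, visible to every function in this file

def parse_movie(html):
    web_page = BeautifulSoup(html, features="html.parser")
    info = web_page.find('h1', class_="moviename")
    movies.append(info.text)  # mutation, not assignment, so no 'global' statement needed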
Second, you can just pass the list as a parameter, like this.
Since Python passes object references, appending inside these functions mutates the very list defined in main, so nothing needs to be returned.
def parse_main(html, movies):
    webpage = BeautifulSoup(html, features="html.parser")
    table = webpage.find('table', id='itemList')
    for a_tag in table.find_all('a', class_='all'):
        parse_movie(get_html('https://www.somerandommovieswebsite.com' + a_tag['href']), movies)

def parse_movie(html, movies):
    web_page = BeautifulSoup(html, features="html.parser")
    info = web_page.find('h1', class_="moviename")
    movies.append(info.text)

def main():
    movies = []
    parse_main(get_html('https://www.somerandommovieswebsite.com'), movies)
    print(movies)
The third approach is to create the list inside the function and return it:
def parse_main(html):
    webpage = BeautifulSoup(html, features="html.parser")
    table = webpage.find('table', id='itemList')
    movies = []
    for a_tag in table.find_all('a', class_='all'):
        movies.append(parse_movie(get_html('https://www.somerandommovieswebsite.com' + a_tag['href'])))
    return movies

def parse_movie(html):
    web_page = BeautifulSoup(html, features="html.parser")
    info = web_page.find('h1', class_="moviename")
    return info.text

def main():
    movies = parse_main(get_html('https://www.somerandommovieswebsite.com'))
    print(movies)

The easiest approach would be to use a global variable, but you should avoid global variables whenever possible. You can restructure your code like this and avoid both globals and passing the variable as a parameter.
import urllib.request
from bs4 import BeautifulSoup

def get_html(url):
    response = urllib.request.urlopen(url)
    return response.read()

def parse_main(html):
    parse_movies = []
    webpage = BeautifulSoup(html, features="html.parser")
    table = webpage.find('table', id='itemList')
    for a_tag in table.find_all('a', class_='all'):
        parse_movies.append(parse_movie(get_html('https://www.somerandommovieswebsite.com' + a_tag['href'])))
    return parse_movies

def parse_movie(html):
    web_page = BeautifulSoup(html, features="html.parser")
    info = web_page.find('h1', class_="moviename")
    return info.text

def main():
    movies = parse_main(get_html('https://www.somerandommovieswebsite.com'))
    print(movies)

if __name__ == '__main__':
    main()

Avoid using global variables; in most cases it's better.
The issue was that movies was a local variable of main, meaning parse_movie saw an entirely undefined name rather than the list defined in your main.
In the code below, the movies list lives in parse_main, parse_movie returns each extracted title, and added return statements carry the completed list back up to main.
import urllib.request
from bs4 import BeautifulSoup

def get_html(url):
    response = urllib.request.urlopen(url)
    return response.read()

def parse_main(html):
    movies = []
    webpage = BeautifulSoup(html, features="html.parser")
    table = webpage.find('table', id='itemList')
    for a_tag in table.find_all('a', class_='all'):
        movies.append(parse_movie(get_html('https://www.somerandommovieswebsite.com' + a_tag['href'])))
    return movies

def parse_movie(html):
    web_page = BeautifulSoup(html, features="html.parser")
    info = web_page.find('h1', class_="moviename")
    return info.text

def main():
    movies = parse_main(get_html('https://www.somerandommovieswebsite.com'))
    print(movies)

if __name__ == '__main__':
    main()

movies is a local variable inside your main function, so it's normal that your other functions can't find it. Either make it global (not always a good idea) or pass it as an argument.


How to get the tokens in data-search-sol-meta

import time
import requests
import pandas as pd
from bs4 import BeautifulSoup

def extract(page):
    url = f'https://www.jobstreet.com.my/en/job-search/administrative-assistant-jobs/{page}/'
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    return soup

def transform(soup):
    jobs = soup.find_all('div', class_='sx2jih0 zcydq876 zcydq866 zcydq896 zcydq886 zcydq8n zcydq856 zcydq8f6 zcydq8eu')
    for job in jobs[:29]:
        for token in job.find_all('div', attrs={'data-search-sol-meta': True}):
            more_details = token.text.strip()
            job_detail = {
                'more details': more_details
            }
            joblist.append(job_detail)

joblist = []
dummy = 2
for i in range(0, dummy, 1):
    c = extract(i + 1)
    transform(c)
    print(f'Progress Page: [{int(i) + 1}/{dummy}]')
    time.sleep(4)
df = pd.DataFrame(joblist)
I want to scrape the tokens in those data-search-sol-meta attributes; how do I get them?
<div data-search-sol-meta="{"searchRequestToken":"62781aeb-4a14-43c9-b985-8be617cc1107","token":"0~62781aeb-4a14-43c9-b985-8be617cc1107","jobId":"jobstreet-my-job-5011156","section":"MAIN","sectionRank":1,"jobAdType":"ORGANIC","tags":{"mordor__flights":"mordor_80","jobstreet:userGroup":"BB","jobstreet:s_vi":"[CS]v1|314CC40D0D655F39-400007A66AC825EB[CE]"}}">
The results in the DataFrame (the more_details column) that I've got are just "None".
I would use a more robust CSS selector list, i.e. not the dynamic classes. Select high enough in the DOM to be able to reach both the attribute you want and the job info. You can then extract the attribute holding the tokens and use the json library to parse them out separately.
import requests, json
from bs4 import BeautifulSoup

def extract(page):
    url = f"https://www.jobstreet.com.my/en/job-search/administrative-assistant-jobs/{page}/"
    r = requests.get(url)
    soup = BeautifulSoup(r.content, "html.parser")
    return soup

def transform(soup):
    jobs = soup.select("[data-automation=jobListing] > div:has(article)")
    for job in jobs:
        print(job.select_one("h1 span").text)
        print()
        print(job["data-search-sol-meta"])
        print()
        data = json.loads(job["data-search-sol-meta"])
        print("searchRequestToken: ", data["searchRequestToken"])
        print("token: ", data["token"])
        print()

soup = extract(1)
transform(soup)
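An alternative sketch that selects directly on the attribute itself, so the result is independent of any container classes (same soup and imports as above; extract_tokens is a hypothetical helper name):

def extract_tokens(soup):
    tokens = []
    # match any element carrying the attribute, regardless of its class names
    for tag in soup.select("[data-search-sol-meta]"):
        data = json.loads(tag["data-search-sol-meta"])
        tokens.append({"searchRequestToken": data["searchRequestToken"],
                       "token": data["token"]})
    return tokens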

Function to get multiple soups only gives one soup

So I am trying to create a function that builds a list of multiple soups. I started by doing it as plain top-level code (I'm not sure what to call this exactly):
import requests
from bs4 import BeautifulSoup

list_url = ["http://www.facebook.com", "https://www.google.com", "http://www.yahoo.com"]
list_soup = []
for url in list_url:
    soup = BeautifulSoup(requests.get(url).text, "html.parser")
    list_soup.append(soup)
And this code works, but when I make a function out of it:
def get_multi_soup(list_url):
    list_multi = []
    for url in list_url:
        soup = BeautifulSoup(requests.get(url).text, "html.parser")
        list_multi.append(soup)
        return list_multi  # note: the return sits inside the for loop

list_soup = get_multi_soup(list_url)
The code does not work as intended, as it only gives one soup instead of three.
Can somebody explain why this doesn't work? list_soup ends up holding only one soup.
Just move the return outside the for loop; you are returning on the first iteration.
def get_multi_soup(list_url):
    list_multi = []
    for url in list_url:
        soup = BeautifulSoup(requests.get(url).text, "html.parser")
        list_multi.append(soup)
    return list_multi

list_soup = get_multi_soup(list_url)
Should do the trick. :)
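Equivalently, once the return is out of the loop, the whole function collapses to a list comprehension (a sketch, assuming the same requests and BeautifulSoup imports):

def get_multi_soup(list_url):
    # one soup per URL, built in request order
    return [BeautifulSoup(requests.get(url).text, "html.parser") for url in list_url]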

How do I call a class for use in my main function

I want to use a class in my main function but don't know exactly how to do it.
Here's what I have tried so far. The following code is only for demonstration purposes.
The code has been edited to what @furas suggested.
import requests
from bs4 import BeautifulSoup
from class.embed import Embed

def request(r):
    s = requests.Session()
    r = s.get(url)
    return r

def data(r):
    soup = BeautifulSoup(r.txt, 'lxml')
    title = soup.select('.ac-ln-title-comingsoon')
    return title

def main():
    url = 'https://www.apple.com/macbook-pro-16/'
    old_title = None
    while True:
        r = request(url)
        title = data(r)
        if title != old_title:
            url = 'https://www.apple.com/macbook-pro-16/specs/'
            embed_class = Embed(url)
            print(price, processor)
        else:
            print('Lorem ipsum')

if __name__ == "__main__":
    main()
This is my class:
class Embed:
    def __init__(self, url):
        self.r = request(url)

    def content(r):
        soup = BeautifulSoup(self.r.text, 'lxml')
        price = soup.select('.column large-6').get_text()
        processor = soup.select('.techspecs-column').get_text()
First, you have to use self as the first argument in all methods. When you run
embed_class.content()
Python actually runs
Embed.content(embed_class)
so it assigns the instance embed_class to self.
Second, you should use self. to get access to variables from one method in another method.
A method should also use return so you can use its values outside the method.
BTW:
select() gives a list (even if it finds only one element, or none at all), so you can't use select().get_text(); you have to call get_text() on every element separately, using a for loop or a list comprehension, i.e.
price = [item.get_text(strip=True) for item in price]
The page has two prices and two descriptions for the processor, so you may need a for loop to display them in a more readable way. I display them as lists.
I skipped the while loop because it was useless for me. I also changed the classes in select() to get only part of the page.
import requests
from bs4 import BeautifulSoup

class Embed:
    def __init__(self, url):
        self.r = request(url)

    def content(self):
        soup = BeautifulSoup(self.r.text, 'lxml')
        price = soup.select('.section-price .column.large-6')
        price = [item.get_text(strip=True) for item in price]
        processor = soup.select('.section-processor')
        processor = [item.get_text(strip=True) for item in processor]
        return price, processor

def request(url):
    s = requests.Session()
    r = s.get(url)
    return r

def data(r):
    soup = BeautifulSoup(r.text, 'lxml')
    title = soup.select('.ac-ln-title-comingsoon')
    title = title[0].get_text()
    return title

def main():
    url = 'https://www.apple.com/macbook-pro-16/'
    old_title = None
    r = request(url)
    title = data(r)
    print(title)
    if title != old_title:
        url = 'https://www.apple.com/macbook-pro-16/specs/'
        embed_class = Embed(url)  # it runs only `__init__()`
        price, processor = embed_class.content()
        print(price, processor)
    else:
        print('Lorem ipsum')

if __name__ == "__main__":
    main()
What I understood from your question is that you want to create an object of that class, pass the url into the parameterized constructor, and use it in your main function.
Below should be your main function:
def main():
    url = 'https://www.apple.com/macbook-pro-16/'
    old_title = None
    while True:
        r = request(url)
        title = data(r)
        if title != old_title:
            url = 'https://www.apple.com/macbook-pro-16/specs/'
            embed_class = Embed(url)
            embed_class.content()
            print(embed_class.price, embed_class.productData)
        else:
            print('Lorem ipsum')
Below should be your Embed class:
class Embed:
    def __init__(self, url):
        self.r = request(url)

    def content(self):
        soup = BeautifulSoup(self.r.text, 'lxml')
        # store the results on self so main() can read them after calling content()
        self.price = soup.select_one('.column.large-6').get_text()
        self.processor = soup.select_one('.techspecs-column').get_text()
        self.productData = soup.select_one('#name for extracting productdata#').get_text()
Please don't copy the code I have written verbatim; first try to understand it, and let me know if there are any issues. You may wonder where productData comes from: you have to find the right selector name for productData yourself, since you know where you want to get the product data from. The rest, i.e. your basic doubt about how to call the class members and use them in the main function, should be clear now.
Thank you.

Using BeautifulSoup to find links related to specific keyword

I have to modify this code so the scraper keeps only the links that contain a specific keyword. In my case I'm scraping a newspaper page to find news related to the term 'Brexit'.
I've tried modifying the method parse_links so it only keeps the links (or 'a' tags) that contain 'Brexit' in them, but it doesn't seem to work.
Where should I place the condition?
import requests
from bs4 import BeautifulSoup
from queue import Queue, Empty
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin, urlparse

class MultiThreadScraper:

    def __init__(self, base_url):
        self.base_url = base_url
        self.root_url = '{}://{}'.format(urlparse(self.base_url).scheme, urlparse(self.base_url).netloc)
        self.pool = ThreadPoolExecutor(max_workers=20)
        self.scraped_pages = set([])
        self.to_crawl = Queue(10)
        self.to_crawl.put(self.base_url)

    def parse_links(self, html):
        soup = BeautifulSoup(html, 'html.parser')
        links = soup.find_all('a', href=True)
        for link in links:
            url = link['href']
            if url.startswith('/') or url.startswith(self.root_url):
                url = urljoin(self.root_url, url)
                if url not in self.scraped_pages:
                    self.to_crawl.put(url)

    def scrape_info(self, html):
        return

    def post_scrape_callback(self, res):
        result = res.result()
        if result and result.status_code == 200:
            self.parse_links(result.text)
            self.scrape_info(result.text)

    def scrape_page(self, url):
        try:
            res = requests.get(url, timeout=(3, 30))
            return res
        except requests.RequestException:
            return

    def run_scraper(self):
        while True:
            try:
                target_url = self.to_crawl.get(timeout=60)
                if target_url not in self.scraped_pages:
                    print("Scraping URL: {}".format(target_url))
                    self.scraped_pages.add(target_url)
                    job = self.pool.submit(self.scrape_page, target_url)
                    job.add_done_callback(self.post_scrape_callback)
            except Empty:
                return
            except Exception as e:
                print(e)
                continue

if __name__ == '__main__':
    s = MultiThreadScraper("https://elpais.com/")
    s.run_scraper()
You need to import the re module to match on the specific text value. Try the code below.
import re
links = soup.find_all('a', text=re.compile("Brexit"))
This should return only the links whose text contains Brexit.
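Integrated into the question's parse_links, that filter might look like the sketch below (note that text= matches against the anchor's string content, and newer BeautifulSoup versions prefer the string= alias):

import re

class MultiThreadScraper:
    # ... __init__ and the other methods unchanged ...

    def parse_links(self, html):
        soup = BeautifulSoup(html, 'html.parser')
        # keep only anchors whose text mentions Brexit
        links = soup.find_all('a', href=True, text=re.compile("Brexit"))
        for link in links:
            url = link['href']
            if url.startswith('/') or url.startswith(self.root_url):
                url = urljoin(self.root_url, url)
                if url not in self.scraped_pages:
                    self.to_crawl.put(url)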
You can get the text of the element using the getText() method and check whether the string actually contains "Brexit":
if "Brexit" in link.getText().split():
url = link["href"]
I added a check in this function. See if that does the trick for you:
def parse_links(self, html):
    soup = BeautifulSoup(html, 'html.parser')
    links = soup.find_all('a', href=True)
    for link in links:
        if 'BREXIT' in link.text.upper():  # <------ new if statement
            url = link['href']
            if url.startswith('/') or url.startswith(self.root_url):
                url = urljoin(self.root_url, url)
                if url not in self.scraped_pages:
                    self.to_crawl.put(url)

Appending items Multiprocessing

In the function get_links, I am fetching the links from URLs. And in the scrape function (test in the code below), I am getting the content of each URL using a text_from_html function (not shown in the code). I want to append the url and visible_text to two lists holding the urls and the visible text of each URL. Here each list contains only one item, and the previous one keeps getting replaced. I want to keep the previous values as well.
I'm getting the output as:
['https://www.scrapinghub.com']
['https://www.goodreads.com/quotes']
I need them in a single list.
import re
import requests
from bs4 import BeautifulSoup
from multiprocessing import Pool

def get_links(url):
    visited_list.append(url)
    try:
        source_code = requests.get(url)
    except Exception:
        get_links(fringe.pop(0))
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, "lxml")
    for link in soup.findAll(re.compile(r'(li|a)')):
        href = link.get('href')
        if (href is None) or (href in visited_list) or (href in fringe) or (('http://' not in href) and ('https://' not in href)):
            continue
        else:
            subs = href.split('/')[2]
            fstr = repr(fringe)
            if subs in fstr:
                continue
            else:
                if ('blah' in href):
                    if ('www' not in href):
                        href = href.split(":")[0] + ':' + "//" + "www." + href.split(":")[1][2:]
                        fringe.append(href)
                else:
                    fringe.append(href)
    return fringe

def test(url):
    try:
        res = requests.get(url)
        plain_text = res.text
        soup = BeautifulSoup(plain_text, "lxml")
        visible_text = text_from_html(plain_text)
        URL.append(url)
        paragraph.append(visible_text)
    except Exception:
        print("CHECK the URL {}".format(url))

if __name__ == "__main__":
    p = Pool(10)
    p.map(test, fringe)
    p.terminate()
    p.join()
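For context (no answer appears in this thread): with multiprocessing.Pool, each worker runs in a separate process with its own copy of the module-level lists, so appends made inside test never reach the parent process. A minimal sketch of the usual fix, returning the values and letting map collect them (function and list names follow the question's code; text_from_html is the question's helper):

from multiprocessing import Pool

def test(url):
    try:
        res = requests.get(url)
        visible_text = text_from_html(res.text)
        return url, visible_text          # return instead of appending in the worker
    except Exception:
        print("CHECK the URL {}".format(url))
        return None

if __name__ == "__main__":
    with Pool(10) as p:
        # map gathers the workers' return values in the parent process
        results = [r for r in p.map(test, fringe) if r is not None]
    URL = [u for u, _ in results]
    paragraph = [t for _, t in results]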
