Find subpage urls with articles and collect data from them - python

The script should find the addresses of subpages with articles and collect the necessary data from them. The data should go to the database but I don't know how to make the script pull the content of each article from every page of the blog.
import requests
from bs4 import BeautifulSoup
from nltk.tokenize import RegexpTokenizer
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
url = 'https://xxx/'
r = requests.get(url)
# Extract HTML
html = r.text
# Create a BeautifulSoup object from the HTML
soup = BeautifulSoup(html, "html5lib")
# Get the text
text = soup.get_text()
# Create tokenizer
tokenizer = RegexpTokenizer('\w+')
# Create tokens
tokens = tokenizer.tokenize(text)
# Initialize new list
words = []
# Loop through list
for word in tokens:
words.append(word.lower())
# Get English stopwords and print some of them
sw = nltk.corpus.stopwords.words('english')
# Initialize new list
words_ns = []
for word in words:
if word not in sw:
words_ns.append(word)
# plotting
freqdist1 = nltk.FreqDist(words_ns)
freqdist1.plot(25)
print(soup.get_text())

You could do the whole thing with beautifulsoup as requests. The text extraction code is by #nmgeek; the same question there has other methods to choose from. I am guessing you can then handle the text with nltk. The method is nice as you can play with which selectors you add to list. You can achieve something similar with selector list passed to select i.e. [item.text for item in soup.select('selector list goes here')
Edit: Below gets you all the links but seems website blocks you after a while. Have a look at rotating IPs and these/User-Agents in the loop over all_links
If you have to resort to selenium at least you have the list of all article links to you can loop over and .get with selenium
import requests
from bs4 import BeautifulSoup as bs
url = 'https://teonite.com/blog/page/{}/index.html'
all_links = []
headers = {
'Accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'User-Agent' : 'Mozilla/5.0'
}
with requests.Session() as s:
r = s.get('https://teonite.com/blog/')
soup = bs(r.content, 'lxml')
article_links = ['https://teonite.com' + item['href'][2:] for item in soup.select('.post-content a')]
all_links.append(article_links)
num_pages = int(soup.select_one('.page-number').text.split('/')[1])
for page in range(2, num_pages + 1):
r = s.get(url.format(page))
soup = bs(r.content, 'lxml')
article_links = ['https://teonite.com' + item['href'][2:] for item in soup.select('.post-content a')]
all_links.append(article_links)
all_links = [item for i in all_links for item in i]
for article in all_links:
#print(article)
r = s.get(article, headers = headers)
soup = bs(r.content, 'lxml')
[t.extract() for t in soup(['style', 'script', '[document]', 'head', 'title'])]
visible_text = soup.getText() # taken from https://stackoverflow.com/a/19760007/6241235 #nmgeek
# here I think you need to consider IP rotation/User-Agent changing
try:
print(soup.select_one('.post-title').text)
except:
print(article)
print(soup.select_one('h1').text)
break
# do something with text
Adding in selenium seems to definitely solve bad request problem of being blocked:
import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver
url = 'https://teonite.com/blog/page/{}/index.html'
all_links = []
with requests.Session() as s:
r = s.get('https://teonite.com/blog/')
soup = bs(r.content, 'lxml')
article_links = ['https://teonite.com' + item['href'][2:] for item in soup.select('.post-content a')]
all_links.append(article_links)
num_pages = int(soup.select_one('.page-number').text.split('/')[1])
for page in range(2, num_pages + 1):
r = s.get(url.format(page))
soup = bs(r.content, 'lxml')
article_links = ['https://teonite.com' + item['href'][2:] for item in soup.select('.post-content a')]
all_links.append(article_links)
all_links = [item for i in all_links for item in i]
d = webdriver.Chrome()
for article in all_links:
d.get(article)
soup = bs(d.page_source, 'lxml')
[t.extract() for t in soup(['style', 'script', '[document]', 'head', 'title'])]
visible_text = soup.getText() # taken from https://stackoverflow.com/a/19760007/6241235 #nmgeek
try:
print(soup.select_one('.post-title').text)
except:
print(article)
print(soup.select_one('h1').text)
break #for debugging
# do something with text
d.quit()

Related

How to get the tokens in data-search-meta-sol

def extract(page):
url = f'https://www.jobstreet.com.my/en/job-search/administrative-assistant-jobs/{page}/'
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')
return soup
def transform(soup):
jobs = soup.find_all('div', class_='sx2jih0 zcydq876 zcydq866 zcydq896 zcydq886 zcydq8n zcydq856 zcydq8f6 zcydq8eu')
for job in jobs[:29]:
for token in job.find_all('div', attrs={'data-search-sol-meta': True}):
more_details = token.text.strip()
job_detail = {
'more details': more_details
}
joblist.append(job_detail)
joblist = []
dummy = 2
for i in range(0, dummy, 1):
c = extract(i + 1)
transform(c)
print(f'Progress Page: [{int(i) + 1}/{dummy}]')
time.sleep(4)
df = pd.DataFrame(joblist)
I want to scrape the tokens in those data-search-sol-meta tags, how to i get it?
<div data-search-sol-meta="{"searchRequestToken":"62781aeb-4a14-43c9-b985-8be617cc1107","token":"0~62781aeb-4a14-43c9-b985-8be617cc1107","jobId":"jobstreet-my-job-5011156","section":"MAIN","sectionRank":1,"jobAdType":"ORGANIC","tags":{"mordor__flights":"mordor_80","jobstreet:userGroup":"BB","jobstreet:s_vi":"[CS]v1|314CC40D0D655F39-400007A66AC825EB[CE]"}}">
the results in the pd (more_details column) that I've got is just "None"
I would use a more robust css selector list i.e. not the dynamic classes. Be high enough in the DOM to be able to select both the attributes you want and then the job info. You can extract the attribute with the tokens and use json library to list separately.
import requests, json
from bs4 import BeautifulSoup
def extract(page):
url = f"https://www.jobstreet.com.my/en/job-search/administrative-assistant-jobs/{page}/"
r = requests.get(url)
soup = BeautifulSoup(r.content, "html.parser")
return soup
def transform(soup):
jobs = soup.select("[data-automation=jobListing] > div:has(article)")
for job in jobs:
print(job.select_one("h1 span").text)
print()
print(job["data-search-sol-meta"])
print()
data = json.loads(job["data-search-sol-meta"])
print("searchRequestToken: ", data["searchRequestToken"])
print("token: ", data["token"])
print()
soup = extract(1)
transform(soup)

Iterating over urls fails to find correct href in Python using BeautifulSoup

I am iterating through the website in the code. The following is what my code does. Loops through the 52 pages and gets the link to each URLs.
Then it iterates through those URLs and tries to get the link for the English Translation. if you see the Mongolian website, it has a section "Орчуулга" on the top right and it has "English" underneath - that is the link to the English translation.
However, my code fails to grab the link for the english translation and gives a wrong url.
Below is a sample output for the first article.
1
{'https://mn.usembassy.gov/mn/2020-naadam-mn/': 'https://mn.usembassy.gov/mn/sitemap-mn/'}
The expected output for the first page should be
1
{'https://mn.usembassy.gov/mn/2020-naadam-mn/': 'https://mn.usembassy.gov/2020-naadam/'}
Below is my code
import requests
from bs4 import BeautifulSoup
url = 'https://mn.usembassy.gov/mn/news-events-mn/page/{page}/'
urls = []
for page in range(1, 53):
print(str(page) + "/52")
soup = BeautifulSoup(requests.get(url.format(page=page)).content, 'html.parser')
for h in soup.find_all('h2'):
a = h.find('a')
urls.append(a.attrs['href'])
print(urls)
i = 0
bilingual_dict = {}
for url in urls:
i += 1
print(i)
soup = BeautifulSoup(requests.get(url.format(page=url)).content, 'html.parser')
for div in soup.find_all('div', class_='translations_sidebar'):
for ul in soup.find_all('ul'):
for li in ul.find_all('li'):
a = li.find('a')
bilingual_dict[url] = a['href']
print(bilingual_dict)
print(bilingual_dict)
This script will print link to english translation:
import requests
from bs4 import BeautifulSoup
url = 'https://mn.usembassy.gov/mn/2020-naadam-mn/'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
link = soup.select_one('a[hreflang="en"]')
print(link['href'])
Prints:
https://mn.usembassy.gov/2020-naadam/
Complete code: (Where there isn't link to english translation, the value is set to None)
import requests
from bs4 import BeautifulSoup
from pprint import pprint
url = 'https://mn.usembassy.gov/mn/news-events-mn/page/{page}/'
urls = []
for page in range(1, 53):
print('Page {}...'.format(page))
soup = BeautifulSoup(requests.get(url.format(page=page)).content, 'html.parser')
for h in soup.find_all('h2'):
a = h.find('a')
urls.append(a.attrs['href'])
pprint(urls)
bilingual_dict = {}
for url in urls:
print(url)
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
link = soup.select_one('a[hreflang="en"]')
bilingual_dict[url] = link['href'] if link else None
pprint(bilingual_dict)

Getting web links to all items in a table and then doing pagination

I am able to get all the links on a particular web page but am having trouble with the pagination.
I am doing the following:
import requests, bs4, re
from bs4 import BeautifulSoup
from urllib.parse import urljoin
r = requests.get(start_url)
soup = BeautifulSoup(r.text,'html.parser')
a_tags = soup.find_all('a')
print(a_tags)
links = [urljoin(start_url, a['href'])for a in a_tags]
print(links)
As a toy example, I am using the following website:
start_url = 'https://www.opencodez.com/page/1'
I am able to get all the links this way. However, I am trying to automate it more by going to the next page and doing the same thing, and outputting all the links to a csv file.
I tried the following but get no outputs:
start_url = 'https://www.opencodez.com/'
with open('names.csv', mode='w') as csv_file:
fieldnames = ['Name']
writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
writer.writeheader()
article_link = []
def scraping(webpage, page_number):
next_page = webpage + str(page_number)
r = requests.get(str(next_page))
soup = BeautifulSoup(r.text,'html.parser')
a_tags = soup.find_all('a')
print(a_tags)
links = [urljoin(start_url, a['href'])for a in a_tags]
print(links)
for x in range(len(soup)):
article_link.append(links)
if page_number < 16:
page_number = page_number + 1
scraping(webpage, page_number)
scraping('https://www.opencodez.com/page/', 1)
#creating the data frame and populating its data into the csv file
data = { 'Name': article_link}
df = DataFrame(data, columns = ['Article_Link'])
df.to_csv(r'C:\Users\xxxxx\names.csv')
Could you please help me determine where I am going wrong?
I do not mind getting the links in either the output console or printed in a csv file
There were issues here and there with your code but this worked for me:
import requests, bs4, re
from bs4 import BeautifulSoup
from urllib.parse import urljoin
start_url = 'https://www.opencodez.com/'
r = requests.get(start_url) # first page scraping
soup = BeautifulSoup(r.text,'html.parser')
a_tags = soup.find_all('a')
article_link = []
links = [urljoin(start_url, a['href'])for a in a_tags]
article_link.append(links)
for page in range(2,19): # for every page after 1
links = [] # resetting lists on every page just in case
a_tags = []
url = 'https://www.opencodez.com/page/'+str(page)
r = requests.get(start_url)
soup = BeautifulSoup(r.text,'html.parser')
a_tags = soup.find_all('a')
links = [urljoin(start_url, a['href'])for a in a_tags]
article_link.append(links)
print(article_link)
I basically just changed how you append to the list article_link. This variable at the moment is a list of length 18. Each list within article_link is a list of 136 links.

Beautiful Soup PYTHON - inside tags

Little problem with BeautifulSoup:
from bs4 import BeautifulSoup
import requests
link = "http://www.cnnvd.org.cn/web/vulnerability/querylist.tag"
req = requests.get(link)
web = req.text
soup = BeautifulSoup(web, "lxml")
cve_name = []
cve_link = []
for par_ in soup.find_all('div', attrs={'class':'fl'}):
for link_ in par_.find_all('p'):
for text_ in link_.find_all('a'):
print (text_.string)
print (text_['href'])
print ("==========")
#cve_name.append(text_.string)
#cve_link.append(text_['href'])
And it gives me twice records :V That probably is easy to solve :V
The same elements are in two places on page so you have to use find()/find_all() to select only one place i.e find(class_='list_list') in
soup.find(class_='list_list').find_all('div', attrs={'class':'fl'}):
Full code:
from bs4 import BeautifulSoup
import requests
link = "http://www.cnnvd.org.cn/web/vulnerability/querylist.tag"
req = requests.get(link)
web = req.text
soup = BeautifulSoup(web, "lxml")
cve_name = []
cve_link = []
for par_ in soup.find(class_='list_list').find_all('div', attrs={'class':'fl'}):
print(len(par_))
for link_ in par_.find_all('p'):
for text_ in link_.find_all('a'):
print (text_.string)
print (text_['href'])
print ("==========")
#cve_name.append(text_.string)
#cve_link.append(text_['href'])
How about this. I used css selectors to do the same.
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import requests
link = "http://www.cnnvd.org.cn/web/vulnerability/querylist.tag"
res = requests.get(link)
soup = BeautifulSoup(res.text, "lxml")
for item in soup.select('.fl p a'):
print("Item: {}\nItem_link: {}".format(item.text,urljoin(link,item['href'])))
Partial Output:
Item: CNNVD-201712-811
Item_link: http://www.cnnvd.org.cn/web/xxk/ldxqById.tag?CNNVD=CNNVD-201712-811
Item: CNNVD-201712-810
Item_link: http://www.cnnvd.org.cn/web/xxk/ldxqById.tag?CNNVD=CNNVD-201712-810
Item: CNNVD-201712-809
Item_link: http://www.cnnvd.org.cn/web/xxk/ldxqById.tag?CNNVD=CNNVD-201712-809

Web crawler - following links

Please bear with me. I am quite new at Python - but having a lot of fun. I am trying to code a web crawler that crawls through election results from the last referendum in Denmark. I have managed to extract all the relevant links from the main page. And now I want Python to follow each of the 92 links and gather 9 pieces of information from each of those pages. But I am so stuck. Hope you can give me a hint.
Here is my code:
import requests
import urllib2
from bs4 import BeautifulSoup
# This is the original url http://www.kmdvalg.dk/
soup = BeautifulSoup(urllib2.urlopen('http://www.kmdvalg.dk/').read())
my_list = []
all_links = soup.find_all("a")
for link in all_links:
link2 = link["href"]
my_list.append(link2)
for i in my_list[1:93]:
print i
# The output shows all the links that I would like to follow and gather information from. How do I do that?
Here is my solution using lxml. It's similar to BeautifulSoup
import lxml
from lxml import html
import requests
page = requests.get('http://www.kmdvalg.dk/main')
tree = html.fromstring(page.content)
my_list = tree.xpath('//div[#class="LetterGroup"]//a/#href') # grab all link
print 'Length of all links = ', len(my_list)
my_list is a list consist of all links. And now you can use for loop to scrape information inside each page.
We can for loop through each links. Inside each page, you can extract information as example. This is only for the top table.
table_information = []
for t in my_list:
page_detail = requests.get(t)
tree = html.fromstring(page_detail.content)
table_key = tree.xpath('//td[#class="statusHeader"]/text()')
table_value = tree.xpath('//td[#class="statusText"]/text()') + tree.xpath('//td[#class="statusText"]/a/text()')
table_information.append(zip([t]*len(table_key), table_key, table_value))
For table below the page,
table_information_below = []
for t in my_list:
page_detail = requests.get(t)
tree = html.fromstring(page_detail.content)
l1 = tree.xpath('//tr[#class="tableRowPrimary"]/td[#class="StemmerNu"]/text()')
l2 = tree.xpath('//tr[#class="tableRowSecondary"]/td[#class="StemmerNu"]/text()')
table_information_below.append([t]+l1+l2)
Hope this help!
A simple approach would be to iterate through your list of urls and parse them each individually:
for url in my_list:
soup = BeautifulSoup(urllib2.urlopen(url).read())
# then parse each page individually here
Alternatively, you could speed things up significantly using Futures.
from requests_futures.sessions import FuturesSession
def my_parse_function(html):
"""Use this function to parse each page"""
soup = BeautifulSoup(html)
all_paragraphs = soup.find_all('p')
return all_paragraphs
session = FuturesSession(max_workers=5)
futures = [session.get(url) for url in my_list]
page_results = [my_parse_function(future.result()) for future in results]
This would be my solution for your problem
import requests
from bs4 import BeautifulSoup
def spider():
url = "http://www.kmdvalg.dk/main"
source_code = requests.get(url)
plain_text = source_code.text
soup = BeautifulSoup(plain_text, 'html.parser')
for link in soup.findAll('div', {'class': 'LetterGroup'}):
anc = link.find('a')
href = anc.get('href')
print(anc.getText())
print(href)
# spider2(href) call a second function from here that is similar to this one(making url = to herf)
spider2(href)
print("\n")
def spider2(linktofollow):
url = linktofollow
source_code = requests.get(url)
plain_text = source_code.text
soup = BeautifulSoup(plain_text, 'html.parser')
for link in soup.findAll('tr', {'class': 'tableRowPrimary'}):
anc = link.find('td')
print(anc.getText())
print("\n")
spider()
its not done... i only get a simple element from the table but you get the idea and how its supposed to work.
Here is my final code that works smooth. Please let me know if I could have done it smarter!
import urllib2
from bs4 import BeautifulSoup
import codecs
f = codecs.open("eu2015valg.txt", "w", encoding="iso-8859-1")
soup = BeautifulSoup(urllib2.urlopen('http://www.kmdvalg.dk/').read())
liste = []
alle_links = soup.find_all("a")
for link in alle_links:
link2 = link["href"]
liste.append(link2)
for url in liste[1:93]:
soup = BeautifulSoup(urllib2.urlopen(url).read().decode('iso-8859-1'))
tds = soup.findAll('td')
stemmernu = soup.findAll('td', class_='StemmerNu')
print >> f, tds[5].string,";",tds[12].string,";",tds[14].string,";",tds[16].string,";", stemmernu[0].string,";",stemmernu[1].string,";",stemmernu[2].string,";",stemmernu[3].string,";",stemmernu[6].string,";",stemmernu[8].string,";",'\r\n'
f.close()

Categories