I extracted text from a blog, but in this format:
<li><b><a href="https://www.bloomberg.com/news/articles/article-id">Text</li>
I need to extract only the text from it.
I tried this code:
from bs4 import BeautifulSoup
from urllib import request
import nltk, re, pprint

def getAllDoxyDonkeyPosts(url, links):
    raw = request.urlopen(url).read()
    raw = BeautifulSoup(raw, "lxml")
    for a in raw.findAll('a'):
        try:
            url = a['href']
            title = a['title']
            if title == "Older Posts":
                links.append(url)
                getAllDoxyDonkeyPosts(url, links)
        except:
            title = ""
    return

blogUrl = "http://doxydonkey.blogspot.in/"
links = []
getAllDoxyDonkeyPosts(blogUrl, links)

def getDoxyDonkeyText(url):
    raw = request.urlopen(url).read()
    raw = BeautifulSoup(raw, "lxml")
    mydivs = raw.findAll("div", {"class": 'post-body'})
    posts = []
    for div in mydivs:
        posts += div.findAll("li")
    return posts

doxyDonkeyPosts = []
for link in links:
    doxyDonkeyPosts += getDoxyDonkeyText(link)

doxyDonkeyPosts
A Tag's .text property gives you just the text, without the markup:

from bs4 import BeautifulSoup

t = '<li><b>Google</b></li>'
b = BeautifulSoup(t, "html.parser")
print(b.text)  # --> Google
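Applied to the question's code, a minimal sketch (assuming doxyDonkeyPosts holds the li Tag objects collected by getDoxyDonkeyText above):

# each element of doxyDonkeyPosts is a bs4 Tag; get_text() strips the markup
texts = [li.get_text() for li in doxyDonkeyPosts]
print(texts[:3])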
I am writing code that is supposed to open a URL, identify the 3rd link, and repeat this process 3 times (each time with the new URL).
I wrote a loop (below), but it seems to start over with the original URL each time.
Can someone help me fix my code?
import urllib.request, urllib.parse, urllib.error
from urllib.parse import urljoin
from bs4 import BeautifulSoup

# blank list
l = []

# starting url
url = input('Enter URL: ')
if len(url) < 1:
    url = 'http://py4e-data.dr-chuck.net/known_by_Fikret.html'

# loop
for _ in range(4):
    html = urllib.request.urlopen(url).read()  # open url
    soup = BeautifulSoup(html, 'html.parser')  # parse through BeautifulSoup
    tags = soup('a')  # extract tags
    for tag in tags:
        url = tag.get('href', None)  # extract links from tags
        l.append(url)  # add the links to a list
    url = l[2:3]  # slice the list to extract the 3rd url
    url = ' '.join(str(e) for e in url)  # change the type to string
    print(url)
Current Output:
http://py4e-data.dr-chuck.net/known_by_Montgomery.html
http://py4e-data.dr-chuck.net/known_by_Montgomery.html
http://py4e-data.dr-chuck.net/known_by_Montgomery.html
http://py4e-data.dr-chuck.net/known_by_Montgomery.html
Desired output:
http://py4e-data.dr-chuck.net/known_by_Montgomery.html
http://py4e-data.dr-chuck.net/known_by_Mhairade.html
http://py4e-data.dr-chuck.net/known_by_Butchi.html
http://py4e-data.dr-chuck.net/known_by_Anayah.html
You need to define the empty list within the loop: because l keeps accumulating links across iterations, l[2:3] always picks the same third element of the ever-growing list instead of the third link of the current page. The following code works:
import urllib.request, urllib.parse, urllib.error
from urllib.parse import urljoin
from bs4 import BeautifulSoup

# starting url
url = input('Enter URL: ')
if len(url) < 1:
    url = 'http://py4e-data.dr-chuck.net/known_by_Fikret.html'

# loop
for _ in range(4):
    l = []  # blank list, reset on every iteration
    html = urllib.request.urlopen(url).read()  # open url
    soup = BeautifulSoup(html, 'html.parser')  # parse through BeautifulSoup
    tags = soup('a')  # extract tags
    for tag in tags:
        url = tag.get('href', None)  # extract links from tags
        l.append(url)  # add the links to a list
    url = l[2:3]  # slice the list to extract the 3rd url
    url = ' '.join(str(e) for e in url)  # change the type to string
    print(url)
Result in terminal:
http://py4e-data.dr-chuck.net/known_by_Montgomery.html
http://py4e-data.dr-chuck.net/known_by_Mhairade.html
http://py4e-data.dr-chuck.net/known_by_Butchi.html
http://py4e-data.dr-chuck.net/known_by_Anayah.html
To keep it simple, here is your approach with a single loop:
for _ in range(4):
    html = urllib.request.urlopen(url).read()  # open url
    soup = BeautifulSoup(html, 'html.parser')  # parse through BeautifulSoup
    tag = soup('a')[2]
    url = tag.get('href', None)
    print(url)
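For completeness, a self-contained sketch of that loop, using the same imports and start URL as the question:

import urllib.request
from bs4 import BeautifulSoup

url = 'http://py4e-data.dr-chuck.net/known_by_Fikret.html'
for _ in range(4):
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser')
    url = soup('a')[2].get('href')  # follow the 3rd link each round
    print(url)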
from bs4 import BeautifulSoup
import requests
url = 'https://www.mediacorp.sg/en/your-mediacorp/our-artistes/tca/male-artistes/ayden-sng-12357686'
artiste_name = 'celeb-name'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')
txt = soup.find_all('h1', attrs={'class':artiste_name})
print(txt)
With the above code, I get this output:
[<h1 class="celeb-name">Ayden Sng</h1>]
What do I need to change in my code, or how can I make it so that I only get 'Ayden Sng' as my output?
Iterate over each entry of the txt list and extract its text property:
txt = [element.text for element in txt] # ['Ayden Sng']
Alternatively, index the first result directly:

from bs4 import BeautifulSoup
import requests
url = 'https://www.mediacorp.sg/en/your-mediacorp/our-artistes/tca/male-artistes/ayden-sng-12357686'
artiste_name = 'celeb-name'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')
txt = soup.find_all('h1', attrs={'class':artiste_name})
print(txt[0].text)
If there is more than one result, you can use this code:
from bs4 import BeautifulSoup
import requests
url = 'https://www.mediacorp.sg/en/your-mediacorp/our-artistes/tca/male-artistes/ayden-sng-12357686'
artiste_name = 'celeb-name'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')
txt = soup.find_all('h1', attrs={'class':artiste_name})
for i in txt:
    print(i.text)
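As an alternative sketch (same page and class name as above), select_one returns just the first match, which avoids indexing into a list:

from bs4 import BeautifulSoup
import requests

url = 'https://www.mediacorp.sg/en/your-mediacorp/our-artistes/tca/male-artistes/ayden-sng-12357686'
soup = BeautifulSoup(requests.get(url).text, 'lxml')

name = soup.select_one('h1.celeb-name')  # first matching <h1>, or None
if name:
    print(name.get_text(strip=True))  # Ayden Sng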
How do I extract the memCode value from each link and save the values as a list?
I want to extract the memCode value from the href of each a tag inside the p tags, using soup.
However, I don't know how to extract it at all.
Please help me.
My code
import urllib.request
from bs4 import BeautifulSoup
url = "https://www.council.ulsan.kr/kor/councillor/viewByPerson.do"
req = urllib.request.Request(url)
sourcecode = urllib.request.urlopen(url).read()
soup = BeautifulSoup(sourcecode, "html.parser")
list = []
for href in soup.find("div", class_="memList memList-col-3").find_all("a"):
    print(href)
Try this, using a CSS selector:
import requests
from bs4 import BeautifulSoup
resp = requests.get('https://www.gjcouncil.go.kr/kr/member/name.do')
soup = BeautifulSoup(resp.text, "html.parser")
for a in soup.select("div[id='member_list'] > ul > li > a"):
    print(a['href'].split("/")[2])
08070
00716
08040
....
....
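The split works because index 2 of the "/"-separated href is the member code; a quick illustration (the href shape here is hypothetical, only the position of the code matters):

# hypothetical href shape; index 2 of the split is the member code
href = '/member/08070/name.do'
print(href.split('/')[2])  # 08070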
You can use split on the "=" and take the -1 index. I also changed the class selector to .btn-home.
import urllib.request
from bs4 import BeautifulSoup
url = "https://www.council.ulsan.kr/kor/councillor/viewByPerson.do"
req = urllib.request.Request(url)
sourcecode = urllib.request.urlopen(url).read()
soup = BeautifulSoup(sourcecode, "html.parser")
ids = [i['href'].split('=')[-1] for i in soup.select('.btn-home')]
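To illustrate the split (the path is the question's URL; the memCode value is taken from the output below, and the exact href shape is assumed):

# assumed href shape: the memCode query parameter holds the code
href = '/kor/councillor/viewByPerson.do?memCode=7620212'
print(href.split('=')[-1])  # 7620212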
import urllib.request
from bs4 import BeautifulSoup
url = "https://www.council.ulsan.kr/kor/councillor/viewByPerson.do"
req = urllib.request.Request(url)
sourcecode = urllib.request.urlopen(url).read()
soup = BeautifulSoup(sourcecode, "html.parser")
href_list = []
for href in soup.find("div", class_="memList memList-col-3").find_all("a"):
    if href['href'] == '#LINK':
        pass
    else:
        href_list.append(href['href'][-7:])

print(href_list)
['7620212', '7670126', '7670420', '7650601', '7890930', '7800407', '7660925', '7641102', '7731222', '7801011', '7570803', '7770106', '7590808', '7700831', '7580115', '7710713', '7680112', '7621125', '7711117', '7680213', '7640925', '7591214']
One of the best methods is to use a regular expression.
Check out this code:
import urllib.request
from bs4 import BeautifulSoup
import re
url = "https://www.council.ulsan.kr/kor/councillor/viewByPerson.do"
req = urllib.request.Request(url)
sourcecode = urllib.request.urlopen(url).read()
soup = BeautifulSoup(sourcecode, "html.parser")
list_ = []
for href in soup.find("div", class_="memList memList-col-3").find_all("a"):
    list_.append(href['href'])

regobj = re.compile(r'memCode=(\w+)')
final = list(filter('#LINK'.__ne__, list_))
result = list(map(lambda i: regobj.search(i).group(1), final))
print(result)
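A compact variant of the same idea, as a sketch: serialize the div once and let findall pull every memCode value in one pass:

import urllib.request
import re
from bs4 import BeautifulSoup

url = "https://www.council.ulsan.kr/kor/councillor/viewByPerson.do"
soup = BeautifulSoup(urllib.request.urlopen(url).read(), "html.parser")

# findall returns every captured memCode group in document order
div = soup.find("div", class_="memList memList-col-3")
codes = re.findall(r'memCode=(\w+)', str(div))
print(codes)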
Trying to get torrent links from skidrowreloaded.
On the post detail page we have a div like this. I tried to get this div by id, but I think the id is dynamic, so I tried to get it by class, but that did not work:
<div id="tabs-105235-0-0" aria-labelledby="ui-id-1" class="ui-tabs-panel ui-widget-content ui-corner-bottom" role="tabpanel" aria-hidden="false">
The following find call returns None:
source2 = source.find("div", {"class": "ui-tabs-panel ui-widget-content ui-corner-bottom"})
Error:
AttributeError: 'NoneType' object has no attribute 'find_all'
Full code:
import os
from bs4 import BeautifulSoup
import requests
import webbrowser

clear = lambda: os.system('cls')
clear()

r = requests.get('https://www.skidrowreloaded.com/')
source = BeautifulSoup(r.content, "lxml")
source2 = source.find_all("h2")

games = []
for i in source2:
    games.append(i.a.get("href"))

lastgame = games[0]
r = requests.get(lastgame)
source = BeautifulSoup(r.content, "lxml")
source2 = source.find("div", {"class": "ui-tabs-panel ui-widget-content ui-corner-bottom"})
source3 = source2.find_all("a")

k = 0
for i in source3:
    if k == 0:  # hide steam link
        k = k + 1
    else:
        if i.get("href") == "https://www.skidrowreloaded.com":  # hide null links
            pass
        else:  # throw links to the browser
            print(i.get("href"))
            webbrowser.open(i.get("href"))
        k = k + 1
To get all the links try this:
import requests
from bs4 import BeautifulSoup
url = "https://www.skidrowreloaded.com/projection-first-light-goldberg/"
soup = BeautifulSoup(requests.get(url).text, "html.parser").find_all("a", {"target": "_blank"})
skip = 'https://www.skidrowreloaded.com'
print([a['href'] for a in soup if a['href'].startswith('https') and a['href'] != skip])
Output:
['https://store.steampowered.com/app/726490/Projection_First_Light/', 'https://mega.nz/file/geogAATS#-0U0PklF-Q5i5l_SELzYx3klh5FZob9HaD4QKcFH_8M', 'https://uptobox.com/rqnlpcp7yb3v', 'https://1fichier.com/?0syphwpyndpo38af04ky', 'https://yadi.sk/d/KAmlsBmGaI1f2A', 'https://pixeldra.in/u/wmcsjuhv', 'https://dropapk.to/v6r7mjfgxjq6', 'https://gofile.io/?c=FRWL1o', 'https://racaty.net/dkvdyjqvg02e', 'https://bayfiles.com/L0k7Qea2pb', 'https://tusfiles.com/2q00y4huuv15', 'https://megaup.net/2f0pv/Projection.First.Light-GoldBerg.zip', 'https://letsupload.org/88t5', 'https://filesupload.org/0d7771dfef54d055', 'https://dl.bdupload.in/17ykjrifizrb', 'https://clicknupload.co/o0k9dnd3iwoy', 'https://dailyuploads.net/n1jihwjwdmjp', 'https://userscloud.com/nircdd4q1t5w', 'https://rapidgator.net/file/b6b8f5782c7c2bdb534214342b58ef18', 'https://turbobit.net/m308zh1hdpba.html', 'https://hitfile.net/5OhkcqZ', 'https://filerio.in/0wbvn4md4i91', 'https://mirrorace.org/m/1Fiic', 'https://go4up.com/dl/0ee9f4866312b5/Projection.First.Light-GoldBerg.zip', 'https://katfile.com/w74l823vuyw5/Projection.First.Light-GoldBerg.zip.html', 'https://multiup.org/download/3d355ba18d58234c792da7a872ab4998/Projection.First.Light-GoldBerg.zip', 'https://dl1.indishare.in/hs55pkx4ex82']
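If you also want to drop the Steam store link, as the question's loop did, one more condition in the comprehension covers it (a sketch on top of the code above):

import requests
from bs4 import BeautifulSoup

url = "https://www.skidrowreloaded.com/projection-first-light-goldberg/"
anchors = BeautifulSoup(requests.get(url).text, "html.parser").find_all("a", {"target": "_blank"})

skip = 'https://www.skidrowreloaded.com'
links = [a['href'] for a in anchors
         if a['href'].startswith('https')
         and a['href'] != skip
         and not a['href'].startswith('https://store.steampowered.com')]
print(links)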
You can use find_all, as noted in the BeautifulSoup documentation:
import requests
from bs4 import BeautifulSoup
response = requests.get("your URL here")
soup = BeautifulSoup(response.text, 'html.parser')
raw_data = soup.find_all("div", class_="ui-tabs-panel ui-widget-content ui-corner-bottom")
# do something with the data
Edit: looking at response.text, the div exists but does not have the class you're looking for, hence the search returns empty. You can search by id with a regex like so:
import requests, re
from bs4 import BeautifulSoup
response = requests.get("your URL here")
soup = BeautifulSoup(response.text, 'html.parser')
raw_data = soup.find_all("div", id=re.compile("^tabs"))
for ele in raw_data:
    a_tag = ele.find("a")
    # do something with the a_tag
A little problem with BeautifulSoup:
from bs4 import BeautifulSoup
import requests
link = "http://www.cnnvd.org.cn/web/vulnerability/querylist.tag"
req = requests.get(link)
web = req.text
soup = BeautifulSoup(web, "lxml")
cve_name = []
cve_link = []
for par_ in soup.find_all('div', attrs={'class':'fl'}):
    for link_ in par_.find_all('p'):
        for text_ in link_.find_all('a'):
            print(text_.string)
            print(text_['href'])
            print("==========")
            #cve_name.append(text_.string)
            #cve_link.append(text_['href'])
And it gives me every record twice :V That is probably easy to solve :V
The same elements appear in two places on the page, so you have to use find()/find_all() to select only one of them, i.e. find(class_='list_list') in
soup.find(class_='list_list').find_all('div', attrs={'class':'fl'})
Full code:
from bs4 import BeautifulSoup
import requests

link = "http://www.cnnvd.org.cn/web/vulnerability/querylist.tag"
req = requests.get(link)
web = req.text
soup = BeautifulSoup(web, "lxml")

cve_name = []
cve_link = []

for par_ in soup.find(class_='list_list').find_all('div', attrs={'class':'fl'}):
    print(len(par_))
    for link_ in par_.find_all('p'):
        for text_ in link_.find_all('a'):
            print(text_.string)
            print(text_['href'])
            print("==========")
            #cve_name.append(text_.string)
            #cve_link.append(text_['href'])
How about this? I used CSS selectors to do the same.
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import requests
link = "http://www.cnnvd.org.cn/web/vulnerability/querylist.tag"
res = requests.get(link)
soup = BeautifulSoup(res.text, "lxml")
for item in soup.select('.fl p a'):
    print("Item: {}\nItem_link: {}".format(item.text, urljoin(link, item['href'])))
Partial Output:
Item: CNNVD-201712-811
Item_link: http://www.cnnvd.org.cn/web/xxk/ldxqById.tag?CNNVD=CNNVD-201712-811
Item: CNNVD-201712-810
Item_link: http://www.cnnvd.org.cn/web/xxk/ldxqById.tag?CNNVD=CNNVD-201712-810
Item: CNNVD-201712-809
Item_link: http://www.cnnvd.org.cn/web/xxk/ldxqById.tag?CNNVD=CNNVD-201712-809