I want to scrape all business links from this page - Python

I want to extract all the links (not the titles) of the companies. Please guide me! Thanks!
Here is the URL of the web page: https://hipages.com.au/find/antenna_services/nsw/sydney
Here is my code:
import requests
from bs4 import BeautifulSoup
import re

def get_index_data(soup):
    try:
        links = soup.find_all('a', {'class': 'sc-bZQynM sc-iwsKbI dpKmnV'}).get('href')
    except:
        links = []
    print(links)

def Main():
    r = requests.get("https://hipages.com.au/find/antenna_services/nsw/sydney")
    get_index_data(r)

Main()

import requests
from bs4 import BeautifulSoup

r = requests.get("https://hipages.com.au/find/antenna_services/nsw/sydney")
soup = BeautifulSoup(r.text, 'html.parser')

for item in soup.find_all("h3", {'class': 'sc-bZQynM sc-iwsKbI dpKmnV'}):
    print(f"https://hipages.com.au{item.previous_element.get('href')}")
Output:
https://hipages.com.au/connect/glencoelectricalbuildingmaintenanceairconditioningsecurityalarmscctv
https://hipages.com.au/connect/emcoelectricalservices
https://hipages.com.au/connect/abcelectricservicespl/service/126298
https://hipages.com.au/connect/ozyblindsnscreens
https://hipages.com.au/connect/samedaytvantennaservice
https://hipages.com.au/connect/langenelectricalnsw
https://hipages.com.au/connect/allprohandymanmaintenance
https://hipages.com.au/connect/amateairconditioningrefrigerationservices
https://hipages.com.au/connect/makeurmove
https://hipages.com.au/connect/uberantennas/service/184323
https://hipages.com.au/connect/cmkelectricalanddata
https://hipages.com.au/connect/antennadistributionservicesptyltd
https://hipages.com.au/connect/sydneysparky
https://hipages.com.au/connect/bluediamond
https://hipages.com.au/connect/digiproantennas
https://hipages.com.au/connect/vascom
https://hipages.com.au/connect/sparkyselectricalanddataptyltd
https://hipages.com.au/connect/prosparksolutions
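Note that class names like sc-bZQynM sc-iwsKbI dpKmnV are generated by styled-components and can change whenever the site is rebuilt, and previous_element is fragile if the markup shifts. A more resilient sketch, keyed off the href pattern instead (an assumption based on the output above: every business profile link starts with /connect/):

import requests
from bs4 import BeautifulSoup

r = requests.get("https://hipages.com.au/find/antenna_services/nsw/sydney")
soup = BeautifulSoup(r.text, "html.parser")

# Collect anchors whose href starts with /connect/ (assumed to be the
# business profile links); a set removes duplicates from repeated anchors.
links = {f"https://hipages.com.au{a['href']}" for a in soup.select("a[href^='/connect/']")}
for link in sorted(links):
    print(link)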


How can I change the code so that the HTML tags do not appear?

from bs4 import BeautifulSoup
import requests
url = 'https://www.mediacorp.sg/en/your-mediacorp/our-artistes/tca/male-artistes/ayden-sng-12357686'
artiste_name = 'celeb-name'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')
txt = soup.find_all('h1', attrs={'class':artiste_name})
print(txt)
With the above code, I get the output:
[<h1 class="celeb-name">Ayden Sng</h1>]
What do I need to change in my code, or how can I make it so that I only get 'Ayden Sng' as my output?
Iterate over each entry of the txt list and extract its text property:
txt = [element.text for element in txt] # ['Ayden Sng']
from bs4 import BeautifulSoup
import requests
url = 'https://www.mediacorp.sg/en/your-mediacorp/our-artistes/tca/male-artistes/ayden-sng-12357686'
artiste_name = 'celeb-name'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')
txt = soup.find_all('h1', attrs={'class':artiste_name})
print(txt[0].text)
If there is more than one result, you can use this code:
from bs4 import BeautifulSoup
import requests
url = 'https://www.mediacorp.sg/en/your-mediacorp/our-artistes/tca/male-artistes/ayden-sng-12357686'
artiste_name = 'celeb-name'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')
txt = soup.find_all('h1', attrs={'class':artiste_name})
for i in txt:
    print(i.text)
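A related option is get_text(strip=True), which returns the same text with surrounding whitespace trimmed. A minimal sketch, using find() and guarding against the tag not being found:

from bs4 import BeautifulSoup
import requests

url = 'https://www.mediacorp.sg/en/your-mediacorp/our-artistes/tca/male-artistes/ayden-sng-12357686'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')

# find() returns the first match or None, so check before reading the text
tag = soup.find('h1', class_='celeb-name')
if tag is not None:
    print(tag.get_text(strip=True))  # 'Ayden Sng'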

How do I make this web crawler print only the titles of the songs?

import requests
from bs4 import BeautifulSoup

url = 'https://www.officialcharts.com/charts/singles-chart'
reqs = requests.get(url)
soup = BeautifulSoup(reqs.text, 'html.parser')

urls = []
for link in soup.find_all('a'):
    print(link.get('href'))

def chart_spider(max_pages):
    page = 1
    while page >= max_pages:
        url = "https://www.officialcharts.com/charts/singles-chart"
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, 'html.parser')
        for link in soup.findAll('a', {"class": "title"}):
            href = "BAD HABITS" + link.title(href)
            print(href)
        page += 1

chart_spider(1)
I'm wondering how to make this print just the titles of the songs instead of the entire page. I want it to go through the top 100 chart and print all the titles for now. Thanks.
Here is a possible solution, which modifies your code as little as possible:
#!/usr/bin/env python3
import requests
from bs4 import BeautifulSoup

URL = 'https://www.officialcharts.com/charts/singles-chart'

def chart_spider():
    source_code = requests.get(URL)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, 'html.parser')
    for title in soup.find_all('div', {"class": "title"}):
        print(title.contents[1].string)

chart_spider()
The result is a list of all the titles found in the page, one per line.
If all you want is the titles for each song in the top 100, this code:
import requests
from bs4 import BeautifulSoup
url='https://www.officialcharts.com/charts/singles-chart/'
req = requests.get(url)
soup = BeautifulSoup(req.content, 'html.parser')
titles = [i.text.replace('\n', '') for i in soup.find_all('div', class_="title")]
does what you are looking for.
You can do it like this.
The song title is present inside a <div> tag with the class name title.
Select all those <div> with .find_all(). This gives you a list of all <div> tags.
Iterate over the list and print the text of each div.
from bs4 import BeautifulSoup
import requests

url = 'https://www.officialcharts.com/charts/singles-chart/'
r = requests.get(url)
soup = BeautifulSoup(r.text, 'lxml')

d = soup.find_all('div', class_='title')
for i in d:
    print(i.text.strip())
Sample Output:
BAD HABITS
STAY
REMEMBER
BLACK MAGIC
VISITING HOURS
HAPPIER THAN EVER
INDUSTRY BABY
WASTED
.
.
.
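If you also want the chart position next to each title, enumerate() pairs a rank with each result; a small sketch, assuming the title divs appear in chart order:

import requests
from bs4 import BeautifulSoup

url = 'https://www.officialcharts.com/charts/singles-chart/'
soup = BeautifulSoup(requests.get(url).text, 'html.parser')

# enumerate() numbers the titles starting from 1 (assumed chart order)
for rank, div in enumerate(soup.find_all('div', class_='title'), start=1):
    print(f"{rank:3d}. {div.text.strip()}")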

How do I extract the underlined value in red below and save it as a list?

I want to extract the memCode value from the href of the <a> inside the <p> tag using soup.
However, I don't know how to extract it at all.
Please help me.
My code:
import urllib.request
from bs4 import BeautifulSoup

url = "https://www.council.ulsan.kr/kor/councillor/viewByPerson.do"
req = urllib.request.Request(url)
sourcecode = urllib.request.urlopen(url).read()
soup = BeautifulSoup(sourcecode, "html.parser")

list = []
for href in soup.find("div", class_="memList memList-col-3").find_all("a"):
    print(href)
Try this, using a CSS selector:
import requests
from bs4 import BeautifulSoup

resp = requests.get('https://www.gjcouncil.go.kr/kr/member/name.do')
soup = BeautifulSoup(resp.text, "html.parser")

for a in soup.select("div[id='member_list'] > ul > li > a"):
    print(a['href'].split("/")[2])
08070
00716
08040
....
....
You can use split on the "=" and take the -1 index. I also changed the class.
import urllib.request
from bs4 import BeautifulSoup
url = "https://www.council.ulsan.kr/kor/councillor/viewByPerson.do"
req = urllib.request.Request(url)
sourcecode = urllib.request.urlopen(url).read()
soup = BeautifulSoup(sourcecode, "html.parser")
ids = [i['href'].split('=')[-1] for i in soup.select('.btn-home')]
import urllib.request
from bs4 import BeautifulSoup

url = "https://www.council.ulsan.kr/kor/councillor/viewByPerson.do"
req = urllib.request.Request(url)
sourcecode = urllib.request.urlopen(url).read()
soup = BeautifulSoup(sourcecode, "html.parser")

href_list = []
for href in soup.find("div", class_="memList memList-col-3").find_all("a"):
    if href['href'] == '#LINK':
        pass
    else:
        href_list.append(href['href'][-7:])

print(href_list)
['7620212', '7670126', '7670420', '7650601', '7890930', '7800407', '7660925', '7641102', '7731222', '7801011', '7570803', '7770106', '7590808', '7700831', '7580115', '7710713', '7680112', '7621125', '7711117', '7680213', '7640925', '7591214']
One of the best methods is using a regular expression.
Check out this code:
import urllib.request
from bs4 import BeautifulSoup
import re

url = "https://www.council.ulsan.kr/kor/councillor/viewByPerson.do"
req = urllib.request.Request(url)
sourcecode = urllib.request.urlopen(url).read()
soup = BeautifulSoup(sourcecode, "html.parser")

list_ = []
for href in soup.find("div", class_="memList memList-col-3").find_all("a"):
    list_.append(href['href'])

regobj = re.compile(r'memCode=(\w+)')
final = list(filter('#LINK'.__ne__, list_))
result = list(map(lambda i: regobj.search(i).group(1), final))
print(result)
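Since memCode is a query-string parameter, the standard library can also pull it out without a regex; a sketch using urllib.parse, assuming each relevant href carries a memCode=... parameter:

import urllib.request
from urllib.parse import urlparse, parse_qs
from bs4 import BeautifulSoup

url = "https://www.council.ulsan.kr/kor/councillor/viewByPerson.do"
soup = BeautifulSoup(urllib.request.urlopen(url).read(), "html.parser")

codes = []
for a in soup.find("div", class_="memList memList-col-3").find_all("a"):
    # parse_qs maps each query parameter name to a list of values;
    # hrefs like '#LINK' have no query string and are skipped.
    params = parse_qs(urlparse(a['href']).query)
    if 'memCode' in params:
        codes.append(params['memCode'][0])
print(codes)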

How to pass multiple classes to BeautifulSoup?

Trying to get torrent links from skidrowreloaded.
On the post detail page we have a div like this. I tried to get this div by id, but I think the id is dynamic, so I tried to get it by class, but that did not work:
<div id="tabs-105235-0-0" aria-labelledby="ui-id-1" class="ui-tabs-panel ui-widget-content ui-corner-bottom" role="tabpanel" aria-hidden="false">
The following code returns None:
source2 = source.find("div", {"class": "ui-tabs-panel ui-widget-content ui-corner-bottom"})
Error:
AttributeError: 'NoneType' object has no attribute 'find_all'
Full code:
import os
from bs4 import BeautifulSoup
import requests
import webbrowser

clear = lambda: os.system('cls')
clear()

r = requests.get('https://www.skidrowreloaded.com/')
source = BeautifulSoup(r.content, "lxml")

source2 = source.find_all("h2")
games = []
for i in source2:
    games.append(i.a.get("href"))

lastgame = games[0]
r = requests.get(lastgame)
source = BeautifulSoup(r.content, "lxml")
source2 = source.find("div", {"class": "ui-tabs-panel ui-widget-content ui-corner-bottom"})
source3 = source2.find_all("a")

k = 0
for i in source3:
    if k == 0:  # hide steam link
        k = k + 1
    else:
        if i.get("href") == "https://www.skidrowreloaded.com":  # hide null links
            pass
        else:  # throw links to the browser
            print(i.get("href"))
            webbrowser.open(i.get("href"))
            k = k + 1
To get all the links, try this:
import requests
from bs4 import BeautifulSoup
url = "https://www.skidrowreloaded.com/projection-first-light-goldberg/"
soup = BeautifulSoup(requests.get(url).text, "html.parser").find_all("a", {"target": "_blank"})
skip = 'https://www.skidrowreloaded.com'
print([a['href'] for a in soup if a['href'].startswith('https') and a['href'] != skip])
Output:
['https://store.steampowered.com/app/726490/Projection_First_Light/', 'https://mega.nz/file/geogAATS#-0U0PklF-Q5i5l_SELzYx3klh5FZob9HaD4QKcFH_8M', 'https://uptobox.com/rqnlpcp7yb3v', 'https://1fichier.com/?0syphwpyndpo38af04ky', 'https://yadi.sk/d/KAmlsBmGaI1f2A', 'https://pixeldra.in/u/wmcsjuhv', 'https://dropapk.to/v6r7mjfgxjq6', 'https://gofile.io/?c=FRWL1o', 'https://racaty.net/dkvdyjqvg02e', 'https://bayfiles.com/L0k7Qea2pb', 'https://tusfiles.com/2q00y4huuv15', 'https://megaup.net/2f0pv/Projection.First.Light-GoldBerg.zip', 'https://letsupload.org/88t5', 'https://filesupload.org/0d7771dfef54d055', 'https://dl.bdupload.in/17ykjrifizrb', 'https://clicknupload.co/o0k9dnd3iwoy', 'https://dailyuploads.net/n1jihwjwdmjp', 'https://userscloud.com/nircdd4q1t5w', 'https://rapidgator.net/file/b6b8f5782c7c2bdb534214342b58ef18', 'https://turbobit.net/m308zh1hdpba.html', 'https://hitfile.net/5OhkcqZ', 'https://filerio.in/0wbvn4md4i91', 'https://mirrorace.org/m/1Fiic', 'https://go4up.com/dl/0ee9f4866312b5/Projection.First.Light-GoldBerg.zip', 'https://katfile.com/w74l823vuyw5/Projection.First.Light-GoldBerg.zip.html', 'https://multiup.org/download/3d355ba18d58234c792da7a872ab4998/Projection.First.Light-GoldBerg.zip', 'https://dl1.indishare.in/hs55pkx4ex82']
You can use find_all, as noted in the BeautifulSoup documentation:
import requests
from bs4 import BeautifulSoup
response = requests.get("your URL here")
soup = BeautifulSoup(response.text, 'html.parser')
raw_data = soup.find_all("div", class_="ui-tabs-panel ui-widget-content ui-corner-bottom")
# do something with the data
Edit: looking at the response.text, the div exists but does not have the class you're looking for, hence it returns empty. You can search by regex on the id like so:
import requests, re
from bs4 import BeautifulSoup

response = requests.get("your URL here")
soup = BeautifulSoup(response.text, 'html.parser')

raw_data = soup.find_all("div", id=re.compile("^tabs"))
for ele in raw_data:
    a_tag = ele.find("a")
    # do something with the a_tag
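Back on the question's title: class_="ui-tabs-panel ui-widget-content ui-corner-bottom" only matches that exact class attribute string, in that order. To match elements that carry all of those classes regardless of order or extra classes, use a CSS selector; passing a list matches any of them. A short sketch of both forms, reusing the URL from the first answer:

import requests
from bs4 import BeautifulSoup

url = "https://www.skidrowreloaded.com/projection-first-light-goldberg/"
soup = BeautifulSoup(requests.get(url).text, "html.parser")

# Chained CSS classes: the div must have all three, in any order.
all_three = soup.select("div.ui-tabs-panel.ui-widget-content.ui-corner-bottom")

# A list matches tags that have ANY of the named classes.
any_of = soup.find_all("div", class_=["ui-tabs-panel", "ui-widget-content"])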

Beautiful Soup PYTHON - inside tags

Little problem with BeautifulSoup:
from bs4 import BeautifulSoup
import requests

link = "http://www.cnnvd.org.cn/web/vulnerability/querylist.tag"
req = requests.get(link)
web = req.text
soup = BeautifulSoup(web, "lxml")

cve_name = []
cve_link = []

for par_ in soup.find_all('div', attrs={'class': 'fl'}):
    for link_ in par_.find_all('p'):
        for text_ in link_.find_all('a'):
            print(text_.string)
            print(text_['href'])
            print("==========")
            #cve_name.append(text_.string)
            #cve_link.append(text_['href'])
And it gives me every record twice :V That is probably easy to solve :V
The same elements appear in two places on the page, so you have to use find()/find_all() to narrow the search to only one of them, i.e. find(class_='list_list') in
soup.find(class_='list_list').find_all('div', attrs={'class':'fl'}):
Full code:
from bs4 import BeautifulSoup
import requests

link = "http://www.cnnvd.org.cn/web/vulnerability/querylist.tag"
req = requests.get(link)
web = req.text
soup = BeautifulSoup(web, "lxml")

cve_name = []
cve_link = []

for par_ in soup.find(class_='list_list').find_all('div', attrs={'class': 'fl'}):
    print(len(par_))
    for link_ in par_.find_all('p'):
        for text_ in link_.find_all('a'):
            print(text_.string)
            print(text_['href'])
            print("==========")
            #cve_name.append(text_.string)
            #cve_link.append(text_['href'])
How about this? I used CSS selectors to do the same.
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import requests

link = "http://www.cnnvd.org.cn/web/vulnerability/querylist.tag"
res = requests.get(link)
soup = BeautifulSoup(res.text, "lxml")

for item in soup.select('.fl p a'):
    print("Item: {}\nItem_link: {}".format(item.text, urljoin(link, item['href'])))
Partial Output:
Item: CNNVD-201712-811
Item_link: http://www.cnnvd.org.cn/web/xxk/ldxqById.tag?CNNVD=CNNVD-201712-811
Item: CNNVD-201712-810
Item_link: http://www.cnnvd.org.cn/web/xxk/ldxqById.tag?CNNVD=CNNVD-201712-810
Item: CNNVD-201712-809
Item_link: http://www.cnnvd.org.cn/web/xxk/ldxqById.tag?CNNVD=CNNVD-201712-809
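To also fill the cve_name and cve_link lists from the original code, the same selector can feed both; a short sketch:

from bs4 import BeautifulSoup
from urllib.parse import urljoin
import requests

link = "http://www.cnnvd.org.cn/web/vulnerability/querylist.tag"
soup = BeautifulSoup(requests.get(link).text, "lxml")

cve_name = []
cve_link = []
for item in soup.select('.fl p a'):
    cve_name.append(item.text)                    # e.g. CNNVD-201712-811
    cve_link.append(urljoin(link, item['href']))  # absolute detail-page URL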
