This is my first post here, so please be patient.
I'm trying to scrape all of the links containing a particular word (the name of a city - Gdańsk) from my local news site.
The problem is that I'm receiving some links that don't have the name of the city in them.
import requests
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
import lxml
import re

url = 'http://www.trojmiasto.pl'
nazwa_pliku = 'testowyplik.txt'
user_agent = UserAgent()
strona = requests.get(url, headers={'user-agent': user_agent.chrome})

with open(nazwa_pliku, 'w', encoding='utf-8') as plik:
    plik.write(strona.content.decode('utf-8') if type(strona.content) == bytes else strona.content)

def czytaj():
    plikk = open('testowyplik.txt', encoding='utf-8')
    data = plikk.read()
    plikk.close()
    return data

soup = BeautifulSoup(czytaj(), 'lxml')
linki = [div.a for div in soup.find_all('div', class_='entry-letter')]
for lin in linki:
    print(lin)

rezultaty = soup.find_all('a', string=re.compile("Gdańsk"))
print(rezultaty)

l = []
s = []
for tag in rezultaty:
    l.append(tag.get('href'))
    s.append(tag.text)
for i in range(len(s)):
    print('url = ' + l[i])
    print('\n')
Here is a complete and simpler example in Python 3:
import requests
from bs4 import BeautifulSoup

city_name = 'Gdańsk'
url = 'http://www.trojmiasto.pl'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'
}

with requests.get(url, headers=headers) as html:
    if html.ok:
        soup = BeautifulSoup(html.content, 'html.parser')
        links = soup('a')
        for link in links:
            if city_name in link.text:
                print('\t- (%s)[%s]' % (link.text, link.get('href')))
Here is the output of the code above (formatted as Markdown just for clarity):
- [ZTM Gdańsk](//ztm.trojmiasto.pl/)
- [ZTM Gdańsk](https://ztm.trojmiasto.pl/)
- [Polepsz Gdańsk i złóż projekt do BO](https://www.trojmiasto.pl/wiadomosci/Polepsz-Gdansk-i-zloz-projekt-do-BO-n132827.html)
- [Pomnik Pileckiego stanie w Gdańsku](https://www.trojmiasto.pl/wiadomosci/Pomnik-rotmistrza-Witolda-Pileckiego-jednak-stanie-w-Gdansku-n132806.html)
- [O Włochu, który pokochał Gdańsk](https://rozrywka.trojmiasto.pl/Roberto-M-Polce-Polacy-maja-w-sobie-cos-srodziemnomorskiego-n132686.html)
- [Plakaty z poezją na ulicach Gdańska](https://kultura.trojmiasto.pl/Plakaty-z-poezja-na-ulicach-Gdanska-n132696.html)
- [Uniwersytet Gdański skończył 49 lat](https://nauka.trojmiasto.pl/Uniwersytet-Gdanski-skonczyl-50-lat-n132797.html)
- [Zapisz się na Półmaraton Gdańsk](https://aktywne.trojmiasto.pl/Zapisz-sie-na-AmberExpo-Polmaraton-Gdansk-2019-n132785.html)
- [Groźby na witrynach barów w Gdańsku](https://www.trojmiasto.pl/wiadomosci/Celtyckie-krzyze-i-grozby-na-witrynach-barow-w-Gdansku-n132712.html)
- [Stadion Energa Gdańsk](https://www.trojmiasto.pl/Stadion-Energa-Gdansk-o25320.html)
- [Gdańsk Big Beat Day 2019 ](https://www.trojmiasto.pl/rd/?t=p&id_polecamy=59233&url=https%3A%2F%2Fimprezy.trojmiasto.pl%2FGdansk-Big-Beat-Day-2019-imp475899.html&hash=150ce9c9)
- [ZTM Gdańsk](https://ztm.trojmiasto.pl/)
You could try an attribute = value CSS selector with the contains operator (*):
rezultaty = [item['href'] for item in soup.select("[href*='Gdansk']")]
Full script
import requests
from bs4 import BeautifulSoup as bs
r = requests.get('http://www.trojmiasto.pl')
soup = bs(r.content, 'lxml')
rezultaty = [item['href'] for item in soup.select("[href*='Gdansk']")]
print(rezultaty)
Without list comprehension:
for item in soup.select("[href*='Gdansk']"):
    print(item['href'])
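Note that this selector matches the ASCII form 'Gdansk' used in the site's URLs. If you also want anchors whose visible text contains the diacritic form 'Gdańsk', a combined check along these lines should work (a sketch; it assumes hrefs are ASCII-folded while the link text keeps the diacritics):

for a in soup.find_all('a', href=True):
    # match either the ASCII form in the URL or the diacritic form in the text
    if 'Gdansk' in a['href'] or 'Gdańsk' in a.get_text():
        print(a['href'])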
I have tried the code below, but it only brings back the first page and does not completely load the reviews for the movies. I am interested in getting all the movie titles, movie dates, and reviews.
from bs4 import BeautifulSoup
import requests

url = 'https://www.nollywoodreinvented.com/list-of-all-reviews'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:55.0) Gecko/20100101 Firefox/55.0'
}
req = requests.get(url, headers=headers)
soup = BeautifulSoup(req.text, 'lxml')

movie_div = soup.find_all('div', class_='article-panel')

title = []
for div in movie_div:
    images = div.find_all('div', class_='article-image-wrapper')
    for image in images:
        image = image.find_all('div', class_='article-image')
        for img in image:
            title.append(img.a.img['title'])

date = []
for div in movie_div:
    date.append(div.find('div', class_='authorship type-date').text.strip())

info = []
for div in movie_div:
    info.append(div.find('div', class_='excerpt-text').text.strip())

import pandas as pd
movie = pd.DataFrame({'title': title, 'date': date, 'info': info}, index=None)
movie.head()
There is a backend API which serves up the HTML you are scraping. You can see it in action if you open your browser's Developer Tools - Network tab - Fetch/XHR and click the 2nd or 3rd page link. We can recreate the POST request with Python like the below:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime

pages = 3
results_per_page = 500  # max 500 I think
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'}
url = 'https://www.nollywoodreinvented.com/wp-admin/admin-ajax.php'

output = []
for page in range(1, pages + 1):
    payload = {
        'action': 'itajax-sort',
        'view': 'grid',
        'loop': 'main loop',
        'location': '',
        'thumbnail': '1',
        'rating': '1',
        'meta': '1',
        'award': '1',
        'badge': '1',
        'authorship': '1',
        'icon': '1',
        'excerpt': '1',
        'sorter': 'recent',
        'columns': '4',
        'layout': 'full',
        'numarticles': str(results_per_page),
        'largefirst': '',
        'paginated': str(page),
        # a dict can't hold the same key twice, so pass both category ids as a list
        'currentquery[category__in][]': ['2648', '2649']
    }
    resp = requests.post(url, headers=headers, data=payload).json()
    print(f'Scraping page: {page} - results: {results_per_page}')
    soup = BeautifulSoup(resp['content'], 'html.parser')

    for film in soup.find_all('div', class_='article-panel'):
        try:
            title = film.find('h3').text.strip()
        except AttributeError:
            continue
        date = datetime.strptime(film.find('span', class_='date').text.strip(), "%B %d, %Y").strftime('%Y-%m-%d')
        likes = film.find('span', class_='numcount').text.strip()
        if not likes:
            likes = 0
        full_stars = [1 for _ in film.find_all('span', class_='theme-icon-star-full')]
        half_stars = [0.5 for _ in film.find_all('span', class_='theme-icon-star-half')]
        stars = (sum(full_stars) + sum(half_stars)) / 2.0
        item = {
            'title': title,
            'date': date,
            'likes': likes,
            'stars': stars
        }
        output.append(item)

df = pd.DataFrame(output)
df.to_csv('nollywood_data.csv', index=False)
print('Saved to nollywood_data.csv')
How can I get all the 'href' attributes with soup in Python? I have tried many times, but in vain.
Whether I use the 'soup.find' or 'soup.find_all' method to go after the 'href', it doesn't work.
Python version: 3.10
!pip install requests
import requests
import time
import pandas as pd
from bs4 import BeautifulSoup

productlink = []
headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Mobile Safari/537.36'}
for page in range(1, 2):
    url = "https://www.momomall.com.tw/s/103487/dcategory/all/3/{page}"
    r = requests.get(url, headers=headers)
    Soup = BeautifulSoup(r.text, "lxml")
    for link in Soup.find_all('ul', class_="searchItem Stype"):
        print(len(link))
        Link = link.li.a
        LINK = Link.get('href')
        print(LINK)
        productlink.append(LINK)
print(productlink)
Sorry, I totally misunderstood your problem: find_all is not a very versatile tool, and you were searching for the wrong ul.
I barely changed your code, but it seems to work now:
import requests
import time
import pandas as pd
from bs4 import BeautifulSoup

productlink = []
headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Mobile Safari/537.36'}
for page in range(1, 2):
    url = f"https://www.momomall.com.tw/s/103487/dcategory/all/3/{page}"
    r = requests.get(url, headers=headers)
    Soup = BeautifulSoup(r.text, "lxml")
    for link in Soup.select('ul#surveyContent > li > a[href]:first-of-type'):
        print(len(link))
        # ~ Link = link.li.a
        LINK = link.get('href')
        print(LINK)
        productlink.append(LINK)
print(productlink)
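Going one step further, you can collect each product's goods_code from the mobile category page, build the product links, and then pull the discounted price out of each product page's inline scripts: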
goodlink = []
for page in range(1, 2):
    url = "https://m.momomall.com.tw/m/store/DCategory.jsp?entp_code=103487&category_code=all&orderby=3&page={}".format(page)
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    for goods_code in soup.select('a.nofollowBtn_star'):
        Goods_code = 'https://www.momomall.com.tw/s/103487/' + goods_code.get('goods_code') + '/'
        goodlink.append(Goods_code)

import re
discount_regex = re.compile(r'discountPrice = (\d{1,5})')
for URL in goodlink:
    R = requests.get(URL, headers=headers)
    Soup = BeautifulSoup(R.text, "lxml")
    for dataprice in Soup.select('script'):
        # search the script's text rather than the Tag object, and skip non-matches
        match = discount_regex.search(dataprice.text)
        if match:
            print(match.group(1))
So I have taken the titles of the medicines from this link: Medicines List.
Now I want to get the content for every medicine; each medicine has its own link.
Example:
Medicines Example
How can I get the content of each medicine using the BeautifulSoup4 and requests libraries?
import requests
from bs4 import BeautifulSoup
from pprint import pp

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0'
}

def main(url):
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    title = [x.text for x in soup.select('a[class$=section__item-link]')]
    count = 0
    for x in range(0, len(title)):
        count += 1
        print("{0}. {1}\n".format(count, title[x]))

main('https://www.klikdokter.com/obat')
Based on what I can see as the response from https://www.klikdokter.com/obat, you should be able to do something like this:
import requests
from bs4 import BeautifulSoup

AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_5_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15'
BASEURL = 'https://www.klikdokter.com/obat'
headers = {'User-Agent': AGENT}

response = requests.get(BASEURL, headers=headers)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
for tag in soup.find_all('a', class_='topics-index--section__item-link'):
    href = tag.get('href')
    if href is not None:
        print(href)
        response = requests.get(href, headers=headers)
        response.raise_for_status()
        """ Do your processing here """
I've created a script in Python making use of POST HTTP requests to get the search results from a webpage. To populate the results, it is necessary to click on the fields sequentially, as shown here. A new page then appears, and that is how the results get populated.
There are ten results on the first page, and the following script can parse them flawlessly.
What I wish to do now is use the results to reach their inner pages in order to parse the Sole Proprietorship Name (English) from there.
website address
I've tried so far with:
import re
import requests
from bs4 import BeautifulSoup

url = "https://www.businessregistration.moc.gov.kh/cambodia-master/service/create.html?targetAppCode=cambodia-master&targetRegisterAppCode=cambodia-br-soleproprietorships&service=registerItemSearch"

payload = {
    'QueryString': '0',
    'SourceAppCode': 'cambodia-br-soleproprietorships',
    'OriginalVersionIdentifier': '',
    '_CBASYNCUPDATE_': 'true',
    '_CBHTMLFRAG_': 'true',
    '_CBNAME_': 'buttonPush'
}

with requests.Session() as s:
    s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0'
    res = s.get(url)
    target_url = res.url.split("&")[0].replace("view.", "update.")
    node = re.findall(r"nodeW\d.+?-Advanced", res.text)[0].strip()
    payload['_VIKEY_'] = re.findall(r"viewInstanceKey:'(.*?)',", res.text)[0].strip()
    payload['_CBHTMLFRAGID_'] = re.findall(r"guid:(.*?),", res.text)[0].strip()
    payload[node] = 'N'
    payload['_CBNODE_'] = re.findall(r"Callback\('(.*?)','buttonPush", res.text)[2]
    payload['_CBHTMLFRAGNODEID_'] = re.findall(r"AsyncWrapper(W\d.+?)'", res.text)[0].strip()
    res = s.post(target_url, data=payload)
    soup = BeautifulSoup(res.content, 'html.parser')
    for item in soup.find_all("span", class_="appReceiveFocus")[3:]:
        print(item.text)
How can I parse the Name (English) from each result's inner page using requests?
This is one of the ways you can parse the name from the site's inner pages and then the email address from the address tab. I added the .get_email() function only because I wanted to show how you can parse content from different tabs.
import re
import requests
from bs4 import BeautifulSoup

url = "https://www.businessregistration.moc.gov.kh/cambodia-master/service/create.html?targetAppCode=cambodia-master&targetRegisterAppCode=cambodia-br-soleproprietorships&service=registerItemSearch"
result_url = "https://www.businessregistration.moc.gov.kh/cambodia-master/viewInstance/update.html?id={}"
base_url = "https://www.businessregistration.moc.gov.kh/cambodia-br-soleproprietorships/viewInstance/update.html?id={}"

def get_names(s):
    s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0'
    res = s.get(url)
    target_url = result_url.format(res.url.split("id=")[1])
    soup = BeautifulSoup(res.text, "lxml")
    payload = {i['name']: i.get('value', '') for i in soup.select('input[name]')}
    payload['QueryString'] = 'a'
    payload['SourceAppCode'] = 'cambodia-br-soleproprietorships'
    payload['_CBNAME_'] = 'buttonPush'
    payload['_CBHTMLFRAG_'] = 'true'
    payload['_VIKEY_'] = re.findall(r"viewInstanceKey:'(.*?)',", res.text)[0].strip()
    payload['_CBHTMLFRAGID_'] = re.findall(r"guid:(.*?),", res.text)[0].strip()
    payload['_CBNODE_'] = re.findall(r"Callback\('(.*?)','buttonPush", res.text)[-1]
    payload['_CBHTMLFRAGNODEID_'] = re.findall(r"AsyncWrapper(W\d.+?)'", res.text)[0].strip()

    res = s.post(target_url, data=payload)
    soup = BeautifulSoup(res.text, "lxml")
    payload.pop('_CBHTMLFRAGNODEID_')
    payload.pop('_CBHTMLFRAG_')
    payload.pop('_CBHTMLFRAGID_')

    for item in soup.select("a[class*='ItemBox-resultLeft-viewMenu']"):
        payload['_CBNAME_'] = 'invokeMenuCb'
        payload['_CBVALUE_'] = ''
        payload['_CBNODE_'] = item['id'].replace('node', '')

        res = s.post(target_url, data=payload)
        soup = BeautifulSoup(res.text, 'lxml')
        address_url = base_url.format(res.url.split("id=")[1])
        node_id = re.findall(r"taba(.*)_", soup.select_one("a[aria-label='Addresses']")['id'])[0]
        payload['_CBNODE_'] = node_id
        payload['_CBHTMLFRAGID_'] = re.findall(r"guid:(.*?),", res.text)[0].strip()
        payload['_CBNAME_'] = 'tabSelect'
        payload['_CBVALUE_'] = '1'
        eng_name = soup.select_one(".appCompanyName + .appAttrValue").get_text()
        yield from get_email(s, eng_name, address_url, payload)

def get_email(s, eng_name, url, payload):
    res = s.post(url, data=payload)
    soup = BeautifulSoup(res.text, 'lxml')
    email = soup.select_one(".EntityEmailAddresses:contains('Email') .appAttrValue").get_text()
    yield eng_name, email

if __name__ == '__main__':
    with requests.Session() as s:
        for item in get_names(s):
            print(item)
The output looks like:
('AMY GEMS', 'amy.n.company@gmail.com')
('AHARATHAN LIN LIANJIN FOOD FLAVOR', 'skykoko344@gmail.com')
('AMETHYST DIAMOND KTV', 'twobrotherktv@gmail.com')
To get the Name (English) only, you can simply replace print(item.text) with print(item.text.split('/')[1].split('(')[0].strip()), which prints AMY GEMS.
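For instance, assuming a raw result reads something like 'KHMER NAME / AMY GEMS (KH)' (the surrounding text here is hypothetical), the expression picks out the part between the slash and the opening parenthesis:

raw = 'KHMER NAME / AMY GEMS (KH)'  # hypothetical raw item text
print(raw.split('/')[1].split('(')[0].strip())  # prints: AMY GEMS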
I am trying to extract some information about an app on Google Play, and BeautifulSoup doesn't seem to work.
The link is this (say):
https://play.google.com/store/apps/details?id=com.cimaxapp.weirdfacts
My code:
url = "https://play.google.com/store/apps/details?id=com.cimaxapp.weirdfacts"
r = requests.get(url)
html = r.content
soup = BeautifulSoup(html)
l = soup.find_all("div", { "class" : "document-subtitles"})
print len(l)
0 #How is this 0?! There is clearly a div with that class
I decided to go all in; it didn't work either:
i = soup.select('html body.no-focus-outline.sidebar-visible.user-has-no-subscription div#wrapper.wrapper.wrapper-with-footer div#body-content.body-content div.outer-container div.inner-container div.main-content div div.details-wrapper.apps.square-cover.id-track-partial-impression.id-deep-link-item div.details-info div.info-container div.info-box-top')
print i
What am I doing wrong?
You need to pretend to be a real browser by supplying the User-Agent header:
import requests
from bs4 import BeautifulSoup

url = "https://play.google.com/store/apps/details?id=com.cimaxapp.weirdfacts"
r = requests.get(url, headers={
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36"
})
html = r.content
soup = BeautifulSoup(html, "html.parser")

title = soup.find(class_="id-app-title").get_text()
rating = soup.select_one(".document-subtitle .star-rating-non-editable-container")["aria-label"].strip()

print(title)
print(rating)
Prints the title and the current rating:
Weird Facts
Rated 4.3 stars out of five stars
To get the additional information field values, you can use the following generic function:
def get_info(soup, text):
    return soup.find("div", class_="title", text=lambda t: t and t.strip() == text).\
        find_next_sibling("div", class_="content").get_text(strip=True)
Then, if you do:
print(get_info(soup, "Size"))
print(get_info(soup, "Developer"))
You will see printed:
1.4M
Email email@here.com