I want to crawl Naver blog search results with the following code, but only posts 1 to 10 on the first page are crawled. How do I edit it so that it continues with posts 11–20, 21–30, and so on?
import sys
from bs4 import BeautifulSoup
import requests
import csv
# Naver blog search URL; the script appends the 1-based index of the first
# result to the trailing `start=` parameter.
BASE_URL = "https://search.naver.com/search.naver?where=post&sm=tab_pge&query=%ED%99%94%EC%A0%95%EC%B2%9C&st=sim&date_option=8&date_from=20160101&date_to=20161231&dup_remove=1&post_blogurl=&post_blogurl_without=&srchby=all&nso=p%3Afrom20160101to20161231&ie=utf8&start="

f = open("park01.csv", 'w', newline='')
wr = csv.writer(f)

for page_index in range(100):
    # Offsets advance in steps of ten: 1, 11, 21, ...
    URL_with_page_num = BASE_URL + str(1 + page_index * 10)
    # NOTE(review): the request below is sent to BASE_URL, not to
    # URL_with_page_num, so the offset computed above is never used.
    response = requests.get(BASE_URL)
    print(response.status_code)

    dom = BeautifulSoup(response.content, "html.parser")
    for post in dom.select("li.sh_blog_top"):
        title_link = post.select_one("a.sh_blog_title")
        passage_node = post.select_one("dd.sh_blog_passage")
        # One CSV row per post: title, link target, preview text.
        wr.writerow([title_link.text, title_link.get("href"), passage_node.text])

f.close()
I think the problem is in the code below:
for i in range(100):
URL_with_page_num = BASE_URL + str(1 + i*10)
response = requests.get(BASE_URL)
Pass URL_with_page_num instead of BASE_URL in the last line of the code above:
response = requests.get(URL_with_page_num)
Related
I used to scrape a site that contains 1000 pages by traversing each page with page numbers 1, 2, 3, ..., 1000 and downloading the data to Excel. Now they have encrypted the page number, so the code no longer works and I need help. The URL now looks like this:
https://bidplus.gem.gov.in/bidlists?bidlists&page_no=Hgw0LYpSZdLXow1Wq84uKar1nxXbFhClXQDuAAiPDxU
code is written below.
pagination html
code
import concurrent
import functools
import concurrent.futures
import requests
from urllib3.exceptions import InsecureRequestWarning
import csv
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
from bs4 import BeautifulSoup as bs
def download_page(session, page_no):
    """Fetch one bid-list page and return its body as text.

    Args:
        session: Object exposing ``get(url, verify=...)``; the caller in this
            script passes a ``requests.Session``.
        page_no: Value appended (via ``str``) to the ``page_no`` query
            parameter.

    Returns:
        The ``text`` attribute of the response returned by ``session.get``.
    """
    url = 'https://bidplus.gem.gov.in/bidlists?bidlists&page_no=' + str(page_no)
    print('URL created: ' + url)
    # NOTE(review): verify=False switches off TLS certificate verification for
    # this request; confirm that this is really required for the target site.
    resp = session.get(url, verify=False)
    return resp.text
def scrap_bid_data():
    """Download bid-list pages 1-9 concurrently and write the bids to a CSV.

    Pages are fetched through ``download_page`` on a thread pool that shares
    one ``requests.Session``. Each page is parsed for the ``pagi_content``
    div, and one row per recognised bid is written to
    ``GEMconcurrent_1016.csv``.
    """
    NUMBER_THREADS = 5  # number of concurrent download requests
    with open('GEMconcurrent_1016.csv', 'w', newline='') as out_file:
        f = csv.writer(out_file)
        f.writerow(['Bidnumber', 'Items', 'Quantity', 'Department', 'Enddate', 'pageNumber'])
        with requests.Session() as session:
            page_downloader = functools.partial(download_page, session)
            with concurrent.futures.ThreadPoolExecutor(max_workers=NUMBER_THREADS) as executor:
                # executor.map yields results in the order of its inputs, so
                # counting from 1 recovers the page number of each result.
                pages = executor.map(page_downloader, range(1, 10))
                for page_no, page in enumerate(pages, start=1):
                    soup_data = bs(page, 'lxml')
                    extracted_data = soup_data.find('div', {'id': 'pagi_content'})
                    if extracted_data is None or len(extracted_data) == 0:
                        print('No data at page number', page_no)
                        print(page)
                        continue

                    # Only the odd-indexed children of the container are read.
                    for idx in range(1, len(extracted_data), 2):
                        bid_data = extracted_data.contents[idx].text.strip().split('\n')
                        if len(bid_data) <= 1:
                            continue
                        print(page_no)
                        # The fixed line offsets below (8, 9, 10, 11, 16, 21)
                        # presumably mirror the site's current bid-card layout;
                        # verify against a live page.
                        if len(bid_data[8]) > 1 and len(bid_data[10].split(':')) > 1:
                            bidno = bid_data[0].split(":")[-1]
                            items = bid_data[9].strip().split('Items:')[-1]
                            qnty = int(bid_data[10].split(':')[-1])
                            dept = (bid_data[11] + bid_data[16].strip()).split(":")[-1]
                            edate = bid_data[21].split("End Date:")[1]
                            f.writerow([bidno, items, qnty, dept, edate, page_no])


scrap_bid_data()
I'm learning Python and web scraping. It is very cool, but I am not able to get what I want.
I'm trying to save product links in a text file so that I can scrape the data afterwards.
Here is my script, which works (almost) correctly in the PyCharm console:
import bs4 as bs4
from bs4 import BeautifulSoup
import requests
# Fixed tail of every listing URL; the page index is inserted in front of it.
suffixeUrl = '_puis_nblignes_est_200.html'
listing_prefix = 'https://www.topachat.com/pages/produits_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_page_est_'

for i in range(15):
    url = listing_prefix + str(i) + suffixeUrl
    response = requests.get(url)
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    if not response.ok:
        # Nothing is printed for pages that did not load successfully.
        continue
    print('Page: ' + str(i))
    # Product links are the anchors inside the "price" divs.
    for data in soup.find_all('div', class_='price'):
        for a in data.find_all('a'):
            link = a.get('href')
            links = 'https://www.topachat.com/' + link
            print(links)  # for getting link
My goal is to save the value of the links variable to a text file, one URL per line.
I tried this, but something is wrong and I don't get each URL:
for link in links:
with open("urls.txt", "a") as f:
f.write(links+"\n")
Can someone please help me?
You can try this way.
Just open the file once and write the complete data to it. Opening and closing files inside a loop is not a good thing to do.
import bs4 as bs4
from bs4 import BeautifulSoup
import requests
# Fixed tail of every listing URL; the page index is inserted in front of it.
suffixeUrl = '_puis_nblignes_est_200.html'
listing_prefix = 'https://www.topachat.com/pages/produits_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_page_est_'

# The output file is opened once and stays open for the whole crawl.
with open('text.txt', 'w') as f:
    for i in range(15):
        url = listing_prefix + str(i) + suffixeUrl
        response = requests.get(url)
        soup = bs4.BeautifulSoup(response.text, 'html.parser')
        if not response.ok:
            continue
        print('Page: ' + str(i))
        for data in soup.find_all('div', class_='price'):
            for a in data.find_all('a'):
                # One absolute product URL per output line.
                link = 'https://www.topachat.com/' + a.get('href')
                f.write(link + '\n')
Sample output from text.txt
https://www.topachat.com/pages/detail2_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_ref_est_in11020650.html
https://www.topachat.com/pages/detail2_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_ref_est_in10119254.html
https://www.topachat.com/pages/detail2_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_ref_est_in20005046.html
https://www.topachat.com/pages/detail2_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_ref_est_in20002036.html
https://www.topachat.com/pages/detail2_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_ref_est_in20002591.html
https://www.topachat.com/pages/detail2_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_ref_est_in20004309.html
https://www.topachat.com/pages/detail2_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_ref_est_in20002592.html
https://www.topachat.com/pages/detail2_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_ref_est_in10089390.html
.
.
.
Your problem is in the `for link in links` line:
link = (a.get('href'))
links = ('https://www.topachat.com/' + link)
print(links)
for link in links:
with open("urls.txt", "a") as f:
f.write(links+"\n")
The type of links is string, so your for loop iterates over it letter by letter (character by character). That is why you see a single character on each line of your txt file. You can just remove the for loop and the code will work:
from bs4 import BeautifulSoup
import requests
# Fixed tail of every listing URL; the page index is inserted in front of it.
suffixeUrl = '_puis_nblignes_est_200.html'

for i in range(15):
    url = 'https://www.topachat.com/pages/produits_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_page_est_' + str(i) + suffixeUrl
    response = requests.get(url)
    # Only `BeautifulSoup` is imported by this script (there is no
    # `import bs4`), so the class has to be called by its bare name.
    soup = BeautifulSoup(response.text, 'html.parser')
    if response.ok:
        print('Page: ' + str(i))
        for data in soup.find_all('div', class_='price'):
            for a in data.find_all('a'):
                link = a.get('href')
                if not link:
                    # An anchor without an href would make the string
                    # concatenation below raise TypeError.
                    continue
                links = 'https://www.topachat.com/' + link
                print(links)  # for getting link
                # `links` is a single URL string: write it as one line
                # instead of iterating over its characters.
                with open("urls.txt", "a") as f:
                    f.write(links + "\n")
You can do like this:
import bs4 as bs4
from bs4 import BeautifulSoup
import requests
# Fixed tail of every listing URL; the page index is inserted in front of it.
suffixeUrl = '_puis_nblignes_est_200.html'
listing_prefix = 'https://www.topachat.com/pages/produits_cat_est_micro_puis_rubrique_est_w_boi_sa_puis_page_est_'

# A set keeps each product URL once, however often it is encountered.
url_list = set()

for i in range(15):
    url = listing_prefix + str(i) + suffixeUrl
    response = requests.get(url)
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    if not response.ok:
        continue
    print('Page: ' + str(i))
    for data in soup.find_all('div', class_='price'):
        for a in data.find_all('a'):
            link = a.get('href')
            links = 'https://www.topachat.com/' + link
            print(links)  # for getting link
            url_list.add(links)

# Append every collected URL to the file, one per line, after the crawl.
with open("urls.txt", "a") as f:
    f.writelines(link + "\n" for link in url_list)
I am trying to crawl multiple pages of a website. But the program can only crawl the first page.
import requests
from bs4 import BeautifulSoup
import re
import json
import time
def make_soup(url):
    """Download *url* and return the review data embedded in the page.

    The page is fetched with ``requests`` and parsed with the ``lxml``
    parser. The ``pageManifest`` object is cut out of the inline script that
    assigns ``window.__WEB_CONTEXT__``; from that text the object containing
    ``cachedFilters`` is extracted and decoded with ``json.loads``.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded JSON object.
    """
    html = requests.get(url).text
    soup = BeautifulSoup(html, 'lxml')

    # Step 1: locate the inline script and capture the pageManifest object.
    pattern = re.compile(r'window.__WEB_CONTEXT__={pageManifest:(\{.*\})};')
    script = soup.find("script", text=pattern)
    manifest_text = pattern.search(script.text).group(1)

    # Step 2: within the manifest, capture the object that follows a 9-12
    # digit key and holds "cachedFilters".
    pattern_number = re.compile(r'\"[0-9]{9,12}\":(\{\"data\":\{\"cachedFilters\":(.*?)\}\}),\"[0-9]{9,11}\"')
    reviews_text = pattern_number.search(manifest_text).group(1)

    return json.loads(reviews_text)
def get_reviews(dictData):
    """Flatten the reviews found in *dictData* into a list of dicts.

    Args:
        dictData: Mapping with the shape
            ``{'data': {'locations': [{'reviewListPage': {'reviews': [...]}}]}}``.
            Each review must provide ``id``, ``absoluteUrl``, ``language``,
            ``createdDate`` and ``userProfile['displayName']``.

    Returns:
        One dict per review, in input order, with the keys ``reviewid``,
        ``reviewurl``, ``reviewlang``, ``reviewdate`` and ``author``. The
        list is empty when there are no locations or no reviews.

    Raises:
        KeyError: If an expected key is missing from the input.
    """
    all_dictionaries = []
    for location in dictData['data']['locations']:
        for review in location['reviewListPage']['reviews']:
            all_dictionaries.append({
                "reviewid": review['id'],
                "reviewurl": review['absoluteUrl'],
                "reviewlang": review['language'],
                "reviewdate": review['createdDate'],
                "author": review['userProfile']['displayName'],
            })
    # Returned after both loops so that every location contributes.
    return all_dictionaries
def main():
    """Read the first review page, then loop over the follow-up offsets.

    Follow-up pages are addressed by an ``-or<N>-`` URL segment, where N runs
    from 5 to 260 in steps of 5. The reviews of each iteration are printed,
    with a half-second pause between requests.
    """
    url = 'https://www.tripadvisor.ch/Hotel_Review-g188113-d228146-Reviews-Coronado_Hotel-Zurich.html#REVIEWS'
    dictData = make_soup(url)
    review_list = get_reviews(dictData)
    #print(review_list)

    for page_number in range(5, 261, 5):  # number in the URL
        next_url = 'https://www.tripadvisor.ch/Hotel_Review-g188113-d228146-Reviews-or' + str(page_number) + '-Coronado_Hotel-Zurich.html#REVIEWS'
        # NOTE(review): `next_url` is built above, but `url` (the first page)
        # is what gets fetched here, so every iteration reads the same page.
        dictData = make_soup(url)
        review_list2 = get_reviews(dictData)
        print(review_list2)
        time.sleep(0.5)


if __name__ == "__main__":
    main()
And I'm not sure if I can crawl multiple pages with this URL. On the website there are 54 pages, but in the URL I always have to add the number 5, like this:
Page 1
https://www.tripadvisor.ch/Hotel_Review-g188113-d228146-Reviews-Coronado_Hotel-Zurich.html#REVIEWS
Page2
https://www.tripadvisor.ch/Hotel_Review-g188113-d228146-Reviews-or5-Coronado_Hotel-Zurich.html#REVIEWS
Page3
https://www.tripadvisor.ch/Hotel_Review-g188113-d228146-Reviews-or10-Coronado_Hotel-Zurich.html#REVIEWS
I don't know if this is a good idea.
Do you have any suggestions? Thank you in advance!
You assign the new URL to next_url, but you use url to read the page.
next_url = 'https://www.tripadvisor.ch/Hotel_Review-g188113-d228146-Reviews-or' + str(page_number) + '-Coronado_Hotel-Zurich.html#REVIEWS'
dictData = make_soup(url)
You have to rename the variable:
url = 'https://www.tripadvisor.ch/Hotel_Review-g188113-d228146-Reviews-or' + str(page_number) + '-Coronado_Hotel-Zurich.html#REVIEWS'
dictData = make_soup(url)
import csv
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

# Listing page to scrape; this script reads a single page (page=1).
my_url = 'https://www.eventbrite.com/d/malaysia--kuala-lumpur--85675181/all-events/?page=1'

# Open the connection and download the page; the context manager closes the
# connection even if the read fails.
with uReq(my_url) as uClient:
    page_html = uClient.read()

# html parser
page_soup = soup(page_html, "html.parser")

# catch each event card
card = page_soup.findAll("div", {"class": "eds-media-card-content__content"})

filename = "Data_Events.csv"
# newline="" is what the csv module expects for files it writes; csv.writer
# quotes fields that contain commas, which plain "," joining would not.
with open(filename, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["events_name", "events_dates", "events_location", "events_fees"])

    for activity in card:
        event_activity = activity.findAll(
            "div", {"class": "eds-event-card__formatted-name--is-clamped"})
        event_date = activity.findAll(
            "div", {"class": "eds-text-bs--fixed eds-text-color--grey-600 eds-l-mar-top-1"})
        if not event_activity or len(event_date) < 3:
            # Skip cards that lack the name or one of the three detail rows
            # instead of failing with IndexError part-way through the file.
            continue

        events_name = event_activity[0].text
        events_dates = event_date[0].text
        events_location = event_date[1].text
        events_fees = event_date[2].text

        print("events_name: " + events_name)
        print("events_dates: " + events_dates)
        print("events_location: " + events_location)
        print("events_fees: " + events_fees)

        writer.writerow([events_name, events_dates, events_location, events_fees])
Hi, I am still a beginner in Python and I would like to know how I can make this script fetch data from the next pages of the website as well.
I have tried something like this:
for pages in page (1, 49)
my_url = 'https://www.eventbrite.com/d/malaysia--kuala-lumpur--85675181/all-
events/?page=1'
Any advice would be appreciated
import itertools
import requests
from bs4 import BeautifulSoup
def parse_page(url, page):
    """Request one listing page and parse it.

    Args:
        url: Listing URL without a query string.
        page: Page number, sent as the ``page`` query parameter.
    """
    params = dict(page=page)
    resp = requests.get(url, params=params)  # requests appends `?page=<page>` to url
    soup = BeautifulSoup(resp.text, 'html.parser')
    ...  # parse data from page
url = 'https://www.eventbrite.com/d/malaysia--kuala-lumpur--85675181/all-events'

# Count upwards from page 1 without knowing the total number of pages.
for page in itertools.count(start=1):
    try:
        parse_page(url, page)
    except Exception:
        # `parse_page` is written for the layout of a results page, so it is
        # expected to fail once there are no more pages to scrape; that
        # failure is the signal to stop.
        break
I've made this script and tried a few options for saving the data, but I keep messing up the code. How can I save the extracted data to a CSV or an Excel file?
import requests
from bs4 import BeautifulSoup
# Search URL up to and including "page="; the page number is appended per
# request. (Ending the base in "page=1" would turn page 1 into page=11,
# page 2 into page=12, and so on.)
base_url = "http://www.privredni-imenik.com/pretraga?abcd=&keyword=&cities_id=0&category_id=0&sub_category_id=0&page="

for current_page in range(1, 200):
    print(current_page)
    url = base_url + str(current_page)
    r = requests.get(url)
    zute_soup = BeautifulSoup(r.text, 'html.parser')
    firme = zute_soup.findAll('div', {'class': 'jobs-item'})
    for title in firme:
        title1 = title.findAll('h6')[0].text
        print(title1)
        # The first "description" div is used as the address and the second
        # as the contact; look them up once per item.
        descriptions = title.findAll('div', {'class': 'description'})
        adresa = descriptions[0].text
        print(adresa)
        kontakt = descriptions[1].text
        print(kontakt)
        print('\n')
        # Text record for this item; nothing writes it out yet.
        page_line = "{title1}\n{adresa}\n{kontakt}".format(
            title1=title1,
            adresa=adresa,
            kontakt=kontakt
        )
A simple way to get a CSV would be to print each line separated by commas, and then use the operating system's ">" to write to a file.
import csv
import requests
from bs4 import BeautifulSoup
# Search URL up to and including "page="; the page number is appended per
# request. (Ending the base in "page=1" would turn page 1 into page=11,
# page 2 into page=12, and so on.)
base_url = "http://www.privredni-imenik.com/pretraga?abcd=&keyword=&cities_id=0&category_id=0&sub_category_id=0&page="

# newline='' is what the csv module expects for files it writes.
with open('scrape_results.csv', 'w', newline='') as scrape_results:
    csvwriter = csv.writer(scrape_results)
    for current_page in range(1, 200):
        url = base_url + str(current_page)
        r = requests.get(url)
        zute_soup = BeautifulSoup(r.text, 'html.parser')
        firme = zute_soup.findAll('div', {'class': 'jobs-item'})
        for title in firme:
            title1 = title.findAll('h6')[0].text
            # The first "description" div is used as the address and the
            # second as the contact; look them up once per item.
            descriptions = title.findAll('div', {'class': 'description'})
            adresa = descriptions[0].text
            kontakt = descriptions[1].text
            # One row per item, tagged with the page it came from.
            csvwriter.writerow([current_page, title1, adresa, kontakt])