Here is my code:
from selenium import webdriver
import urllib
import os

driver = webdriver.Chrome()
path = "/home/winpc/test/python/dup/new"

def get_link_urls(url, driver):
    driver.get(url)
    url = urllib.urlopen(url)
    content = url.readlines()
    urls = []
    for link in driver.find_elements_by_tag_name('a'):
        elem = driver.find_element_by_xpath("//*")
        source_code = elem.get_attribute("outerHTML")
        test = link.get_attribute('href')
        if str(test) != 'None':
            file_name = test.rsplit('/')[-1].split('.')[0]
            file_name_formated = file_name + "Copy.html"
            with open(os.path.join(path, file_name_formated), 'wb') as temp_file:
                temp_file.write(source_code.encode('utf-8'))
            urls.append(link.get_attribute('href'))
    return urls

urls = get_link_urls("http://localhost:8080", driver)
sub_urls = []
for url in urls:
    if str(url) != 'None':
        sub_urls.extend(get_link_urls(url, driver))
This code navigates each and every link properly, but it keeps copying only the first HTML page. I need to save the source code of every page it navigates to. The saving part is done with this code:
file_name_formated = file_name + "Copy.html"
with open(os.path.join(path, file_name_formated), 'wb') as temp_file:
    temp_file.write(source_code.encode('utf-8'))
First of all, you're overwriting url again and again inside the function, so fix that first.
For saving the page source through Selenium, you can use driver.page_source.
Additionally, if you want this code to be faster, consider using the requests module:
response = requests.get(url).content
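A rough sketch of the function with both changes applied, keeping the asker's path variable and the same old-style Selenium calls; one file is written per page visited, named after the page URL rather than once per link:

def get_link_urls(url, driver):
    driver.get(url)
    # page_source is the full HTML of the page the driver is currently on
    source_code = driver.page_source
    file_name = url.rsplit('/')[-1].split('.')[0]
    with open(os.path.join(path, file_name + "Copy.html"), 'wb') as temp_file:
        temp_file.write(source_code.encode('utf-8'))
    urls = []
    for link in driver.find_elements_by_tag_name('a'):
        href = link.get_attribute('href')
        if href is not None:
            urls.append(href)
    return urls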
I want to scrape class="cms-no-route cms-noroute-index page-layout-1column" from <body data-containr="body" class="cms-no-route cms-noroute-index page-layout-1column"> and save it in a txt file, but for some reason nothing happens when I run the script.
import requests
from bs4 import BeautifulSoup
import randomheaders  # the asker's own helper module for random request headers

def get():
    source = requests.get("https://shop.adidas.ae/en/yeezy-boost-350-v2-ash-pearl/GY7658.html", headers=randomheaders.LoadHeader()).text
    soup = BeautifulSoup(source, 'lxml')
    x = soup.find_all('body', datacontainer_="body")
    url = x.get('class')

    filename = "adidaslive.txt"
    with open(filename, "r") as rf:
        with open(filename, "a") as af:
            if url not in rf:
                print(url)
                af.write("\n" + url)
            else:
                print("nothing")
def get():
    source = ...
You need to properly indent your code.
x = soup.find_all('body', datacontainer_="body")
x here is a list with only one element, because there is only one body tag in your HTML source code.
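For example, the class can be read from the first element of that list instead of from the list itself. A minimal sketch (filtering on the tag name alone is enough here, since the page has only one body tag):

x = soup.find_all('body')            # ResultSet: behaves like a list of Tag objects
body_class = x[0].get('class')       # the class attribute of the first match, as a list of strings
class_string = " ".join(body_class)  # "cms-no-route cms-noroute-index page-layout-1column"
print(class_string)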
I tried the following code to download all the PDF files from the links, but it downloads every file each time I run it. What I want: the first time it should download all the PDFs, and on later runs it should download only the ones that are new (it should first check which ones are new).
My Code:
import requests
from bs4 import BeautifulSoup

root_url = 'https://www.iea.org'

def getLinks(url):
    all_links = []
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html.parser')
    for href in soup.find_all(class_='omrlist'):
        all_links.append(root_url + href.find('a').get('href'))
    return all_links

yearLinks = getLinks(root_url + '/oilmarketreport/reports/')
# get report URL
reportLinks = []
for url in yearLinks:
    links = getLinks(url)
    #reportLinks.extend(links)
    #print(reportLinks)
    i = 0
    for url_ in links:
        if "AnnualStatisticalSupplement" not in url_:
            url__ = url_.replace("org..", "org").replace("../", "")
            response = requests.get(url__, stream=True)
            lastindex = url__.rfind('/')
            strlen = len(url__)
            filename = url__[lastindex:strlen]
            with open('/home/pdfs/' + str(filename), 'wb') as pdffile:
                pdffile.write(response.content)
            i += 1
            print(url__)
print("Download Completed")
Then I need to store each file in MongoDB. How should I do that with three columns (pdf name, reported date, flag of process)?
Sorry for the significant change to your code, but it was too messy to read.
If you only want to download the PDFs you don't already have, you must add a check to control that. By the way, if you store the page URL in your database, you won't need to request the page again just to get the PDF name.
import requests
from bs4 import BeautifulSoup

root_url = 'https://www.iea.org'
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0'}
downloaded = ["2018-02-13.pdf"]  # the latest i have

def getLinks(url):
    page = requests.get(url, headers=headers)
    soup = BeautifulSoup(page.text, 'lxml')
    li = soup.find_all("li", class_="omrlist")
    links = [root_url + href.a.get('href') for href in li]
    return links

def get_pdf(url, flag=1):
    # find page link in the month directory
    pdf_page = requests.get(url, headers=headers)
    soup = BeautifulSoup(pdf_page.text, 'lxml')
    li = soup.find_all("li", class_="omrlist")[::-1]  # latest -> old
    latest_pdf_set = [root_url + href.a.get('href') for href in li]
    # find pdf link
    pdf_links = []
    for pdf_url in latest_pdf_set:
        text = requests.get(pdf_url, headers=headers).text
        soup = BeautifulSoup(text, "lxml")
        link = soup.find("div", class_="omrreport pL10").find("a").get("href")
        if link.split("/")[-1] in downloaded:
            flag = 0  # flag = 0 means you found a pdf that you already have
            break
        pdf_links.append(root_url + link)
    return pdf_links, flag

yearLinks = getLinks(root_url + '/oilmarketreport/reports/')
all_ = []
for each in yearLinks:
    pdf_links = get_pdf(each)
    all_ += pdf_links[0]
    if not pdf_links[1]:
        # flag = 0: we reached a pdf we already have, so stop
        break
print(all_)
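The question also asks about keeping track of each PDF in MongoDB with three fields (pdf name, reported date, flag of process). The answer above does not cover that part, but a minimal sketch with pymongo could look like the following; the database and collection names are made up, and reported_date is a placeholder you would fill from your own parsing logic:

from datetime import datetime
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017/")   # assumes a local MongoDB instance
collection = client["iea"]["pdf_reports"]            # hypothetical database / collection names

for link in all_:
    pdf_name = link.split("/")[-1]
    # upsert so that re-running the script does not create duplicate documents
    collection.update_one(
        {"pdf_name": pdf_name},
        {"$setOnInsert": {
            "pdf_name": pdf_name,
            "reported_date": datetime.utcnow(),  # placeholder: store the real report date if you parse it
            "processed": False                   # flag of process
        }},
        upsert=True
    )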
I'm working on a scraper using Selenium.
I have written the script and it scrapes properly; however, I am trying to scrape multiple URLs and then write the results to JSON.
The script scrapes and prints successfully, but I am only getting one result in the JSON, the second URL's details (I get both results when printing).
How do I get both URLs' results?
I think I need to add another for loop for the JSON data, but I can't figure out how to add it!
This is the code I am working with:
# -*- coding: UTF-8 -*-
from selenium import webdriver
import time
import json

def writeToJSONFile(path, fileName, data):
    filePathNameWExt = './' + path + '/' + fileName + '.json'
    with open(filePathNameWExt, 'a') as fp:
        json.dump(data, fp, ensure_ascii=False)

browser = webdriver.Firefox(executable_path="/Users/path/geckodriver")
urls = ['https://www.tripadvisor.co.uk/Restaurant_Review-g186338-d8122594-Reviews-Humble_Grape_Battersea-London_England.html', 'https://www.tripadvisor.co.uk/Restaurant_Review-g186338-d5561842-Reviews-Gastronhome-London_England.html']
data = {}

for url in urls:
    browser.get(url)
    page = browser.find_element_by_class_name('non_hotels_like')
    title = page.find_element_by_class_name('heading_title').text
    street_address = page.find_element_by_class_name('street-address').text
    print(title)
    print(street_address)
    data = {}
    data['title'] = title
    data['street_address'] = street_address

filename = 'properties'
writeToJSONFile('./', filename, data)
browser.quit()
You're trying to add values with the same keys to a dictionary, but a Python dict can only contain unique keys! So instead of writing a second title, you're just overwriting the first one; the same goes for street_address.
You can try saving data as a list of dictionaries instead:
data = []
for url in urls:
    browser.get(url)
    page = browser.find_element_by_class_name('non_hotels_like')
    title = page.find_element_by_class_name('heading_title').text
    street_address = page.find_element_by_class_name('street-address').text
    print(title)
    print(street_address)
    data.append({'title': title, 'street_address': street_address})
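After the loop, the writeToJSONFile helper from the question can then be called once with the whole list (note that it opens the file in 'a' append mode, so re-running the script keeps appending to the same file):

filename = 'properties'
writeToJSONFile('./', filename, data)  # data is now a list of dicts, dumped as one JSON array
browser.quit()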
You are resetting the data variable inside the loop, so only the last result survives by the time the file is written after the loop...
So what I did is add the index of the iteration using enumerate and format it into the key...
Try this, it should work:
from selenium import webdriver
import time
import json

def writeToJSONFile(path, fileName, data):
    filePathNameWExt = './' + path + '/' + fileName + '.json'
    with open(filePathNameWExt, 'a') as fp:
        json.dump(data, fp, ensure_ascii=False)

browser = webdriver.Firefox(executable_path="/Users/path/geckodriver")
urls = ['https://www.tripadvisor.co.uk/Restaurant_Review-g186338-d8122594-Reviews-Humble_Grape_Battersea-London_England.html', 'https://www.tripadvisor.co.uk/Restaurant_Review-g186338-d5561842-Reviews-Gastronhome-London_England.html']
data = {}

for i, url in enumerate(urls):
    browser.get(url)
    page = browser.find_element_by_class_name('non_hotels_like')
    title = page.find_element_by_class_name('heading_title').text
    street_address = page.find_element_by_class_name('street-address').text
    # this f-string formatting is supported from Python 3.6+; you can use another format (for a cleaner job use a list, see the accepted answer)
    data[f'{i}title'] = title
    data[f'{i}street_address'] = street_address
    print(title)
    print(street_address)

filename = 'properties'
writeToJSONFile('./', filename, data)
browser.quit()
Hope you find this helpful!
This is the code I used to take all the pics from r/pics on reddit and put them into a directory. I want to be able to take the actual files in the directory and put them into a list. I'm stuck on how to do this.
import requests
from bs4 import BeautifulSoup as bs
import os

url = "https://www.reddit.com/r/pics/"
r = requests.get(url)
data = r.text
soup = bs(data, 'lxml')
image_tags = soup.findAll('img')

if not os.path.exists('direct'):
    os.makedirs('direct')
os.chdir('direct')

x = 0
for image in image_tags:
    try:
        url = image['src']
        source = requests.get(url)
        if source.status_code == 200:
            img_path = 'direct-' + str(x) + '.jpg'
            with open(img_path, 'wb') as f:
                f.write(requests.get(url).content)
                f.close()
            x += 1
    except:
        pass
Edit: Here is the updated code, but I'm still dealing with the problem:
import requests
from bs4 import BeautifulSoup as bs
import os

url = "https://www.reddit.com/r/drawing"
r = requests.get(url)
data = r.text
soup = bs(data, 'lxml')
image_tags = soup.findAll('img')

if not os.path.exists('directory'):
    os.makedirs('directory')
os.chdir('directory')

x = 0
mylist = []
for image in image_tags:
    url = image['src']
    source = requests.get(url)
    if source.status_code == 200:
        img_path = 'direct-' + str(x) + '.jpg'
        with open(img_path, 'wb') as f:
            f.write(requests.get(url).content)
            mylist.append(img_path)
            f.close()
        x += 1
print(mylist)
Create a list at the beginning of your code:
...
mylist = []
...
Then, after you get each image, add it to the list:
...
img_path = 'direct-' + str(x) +'.jpg'
mylist.append(img_path)
....
EDIT:
I executed your updated code and image_tags comes back empty - in fact, the page returned by
url = "https://www.reddit.com/r/drawing"
r = requests.get(url)
data = r.text
doesn't contain any images. I guess reddit has some kind of protection to prevent you from fetching images this way.
Try adding print(data) and you will see what I mean
You should use the Reddit API so that reddit doesn't limit your requests.
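A rough sketch of what that could look like using the subreddit's public JSON listing (appending .json to the subreddit URL); the response layout shown here is based on the public listing format and may change, and reddit expects a descriptive User-Agent:

import requests

headers = {'User-Agent': 'simple-image-scraper/0.1'}  # reddit tends to block the default requests User-Agent
r = requests.get("https://www.reddit.com/r/drawing.json", headers=headers)
posts = r.json()["data"]["children"]

image_urls = []
for post in posts:
    link = post["data"].get("url", "")
    if link.endswith((".jpg", ".png")):
        image_urls.append(link)

print(image_urls)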
I am new to Python and just wanted to know if this is possible: I have scraped a URL using urllib and want to edit different pages.
Example:
http://test.com/All/0.html
I want the 0.html to become 50.html, then 100.html, and so on...
found_url = 'http://test.com/All/0.html'
base_url = 'http://test.com/All/'

for page_number in range(0, 1050, 50):
    url_to_fetch = "{0}{1}.html".format(base_url, page_number)
That should give you URLs from 0.html to 1000.html
If you want to use urlparse (as suggested in the comments to your question):
import urlparse

found_url = 'http://test.com/All/0.html'
parsed_url = urlparse.urlparse(found_url)
path_parts = parsed_url.path.split("/")

for page_number in range(0, 1050, 50):
    new_path = "{0}/{1}.html".format("/".join(path_parts[:-1]), page_number)
    parsed_url = parsed_url._replace(path=new_path)
    print parsed_url.geturl()
Executing this script would give you the following:
http://test.com/All/0.html
http://test.com/All/50.html
http://test.com/All/100.html
http://test.com/All/150.html
http://test.com/All/200.html
http://test.com/All/250.html
http://test.com/All/300.html
http://test.com/All/350.html
http://test.com/All/400.html
http://test.com/All/450.html
http://test.com/All/500.html
http://test.com/All/550.html
http://test.com/All/600.html
http://test.com/All/650.html
http://test.com/All/700.html
http://test.com/All/750.html
http://test.com/All/800.html
http://test.com/All/850.html
http://test.com/All/900.html
http://test.com/All/950.html
http://test.com/All/1000.html
Instead of printing in the for loop, you can use the value of parsed_url.geturl() however you need. As mentioned, if you want to fetch the content of the page you can use the Python requests module in the following manner:
import requests
import urlparse

found_url = 'http://test.com/All/0.html'
parsed_url = urlparse.urlparse(found_url)
path_parts = parsed_url.path.split("/")

for page_number in range(0, 1050, 50):
    new_path = "{0}/{1}.html".format("/".join(path_parts[:-1]), page_number)
    parsed_url = parsed_url._replace(path=new_path)
    # print parsed_url.geturl()
    url = parsed_url.geturl()
    try:
        r = requests.get(url)
        if r.status_code == 200:
            with open(str(page_number) + '.html', 'w') as f:
                f.write(r.content)
    except Exception as e:
        print "Error scraping - " + url
        print e
This fetches the content from http://test.com/All/0.html through http://test.com/All/1000.html and saves the content of each URL into its own file. The file name on disk is the file name in the URL, 0.html to 1000.html.
Depending on the performance of the site you are scraping, you might see considerable delays when running the script. If performance matters, you can consider using grequests:
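A minimal sketch of the same download loop with grequests (this assumes grequests is installed; the URL list is built the same way as above, and responses come back in the same order as the requests):

import grequests

base_url = 'http://test.com/All/'
urls = ["{0}{1}.html".format(base_url, n) for n in range(0, 1050, 50)]

# build the requests without sending them, then send them concurrently
pending = (grequests.get(u) for u in urls)
for url, response in zip(urls, grequests.map(pending)):
    if response is not None and response.status_code == 200:
        page_name = url.rsplit('/', 1)[-1]  # e.g. "50.html"
        with open(page_name, 'wb') as f:
            f.write(response.content)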