Scrape html files stored in remote directory - python

I have thousands of HTML files stored in a remote directory. All these files have the same HTML structure. Right now I am scraping each file manually with the following script:
from string import punctuation, whitespace
import urllib2
import datetime
import re
from bs4 import BeautifulSoup as Soup
import csv

today = datetime.date.today()
html = urllib2.urlopen("http://hostname/coimbatore/3BHK_flats_inCoimbatore.html_%94201308110608%94.html").read()
soup = Soup(html)
for li in soup.findAll('li', attrs={'class':'g'}):
    sLink = li.find('a')
    print sLink['href']
    sSpan = li.find('span', attrs={'class':'st'})
    print sSpan
So the above script is for one URL. Likewise, I want to scrape all the HTML files under that directory, irrespective of their file names. I have not found this question asked before.
Update: code
import urllib2
import BeautifulSoup
import re

Newlines = re.compile(r'[\r\n]\s+')

def getPageText(url):
    # given a url, get page content
    data = urllib2.urlopen(url).read()
    # parse as html structured document
    bs = BeautifulSoup.BeautifulSoup(data, convertEntities=BeautifulSoup.BeautifulSoup.HTML_ENTITIES)
    # print the link and snippet of every result item
    for li in bs.findAll('li', attrs={'class':'g'}):
        sLink = li.find('a')
        print sLink['href']
        sSpan = li.find('span', attrs={'class':'st'})
        print sSpan

def main():
    urls = [
        'http://192.168.1.200/coimbatore/3BHK_flats_inCoimbatore.html_%94201308110608%94.html',
        'http://192.168.1.200/coimbatore/3BHK_flats_inCoimbatore.html_%94201308110608%94.html.html'
    ]
    txt = [getPageText(url) for url in urls]

if __name__ == "__main__":
    main()

Use a loop:
...

for url in url_list:
    html = urllib2.urlopen(url).read()
    soup = Soup(html)
    for li in soup.findAll('li', attrs={'class':'g'}):
        sLink = li.find('a')
        print sLink['href']
        sSpan = li.find('span', attrs={'class':'st'})
        print sSpan
If you don't know the URL list in advance, you have to parse the directory listing page first; a sketch of that follows the code below.
import csv
import urllib2
import BeautifulSoup

def getPageText(url, filename):
    data = urllib2.urlopen(url).read()
    bs = BeautifulSoup.BeautifulSoup(data, convertEntities=BeautifulSoup.BeautifulSoup.HTML_ENTITIES)
    with open(filename, 'w') as f:
        writer = csv.writer(f)
        for li in bs.findAll('li', attrs={'class':'g'}):
            sLink = li.find('a')
            sSpan = li.find('span', attrs={'class':'st'})
            writer.writerow([sLink['href'], sSpan])

def main():
    urls = [
        'http://192.168.1.200/coimbatore/3BHK_flats_inCoimbatore.html_%94201308110608%94.html',
        'http://192.168.1.200/coimbatore/3BHK_flats_inCoimbatore.html_%94201308110608%94.html.html',
    ]
    for i, url in enumerate(urls, 1):
        getPageText(url, '{}.csv'.format(i))

if __name__ == "__main__":
    main()
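If the web server publishes an auto-generated index page for that directory (an assumption; it depends on the server configuration), you can build the URL list by scraping the anchors on that listing instead of hard-coding file names. A minimal sketch reusing getPageText() from the answer above; the base_url value is hypothetical:
from urlparse import urljoin
import urllib2
import BeautifulSoup

def list_html_files(base_url):
    # fetch the directory index page and collect links to .html files
    data = urllib2.urlopen(base_url).read()
    listing = BeautifulSoup.BeautifulSoup(data)
    urls = []
    for a in listing.findAll('a'):
        href = a.get('href')
        if href and href.endswith('.html'):
            urls.append(urljoin(base_url, href))
    return urls

def main():
    base_url = 'http://192.168.1.200/coimbatore/'  # hypothetical listing URL
    for i, url in enumerate(list_html_files(base_url), 1):
        getPageText(url, '{}.csv'.format(i))

if __name__ == "__main__":
    main()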

Related

How to add a loop to scrape the next page of a website

My code below works, but I want it to do the same exact thing with the next page of the URL variable; this would be done by appending the number 1, 2, 3, ... depending on the page.
The code essentially scrapes a website that has the thumbnails of various videos and then returns the link to each video. I want it to do this for each page available.
from bs4 import BeautifulSoup
import requests
import re
import urllib.request
from urllib.request import Request, urlopen

URL = "domain.com/"
page = requests.get(URL)
soup = BeautifulSoup(page.content, "html.parser")

endof = soup.find_all('div', class_="th-image")
links = [a['href'] for a in soup.find_all('a', href=True)]
endoflinks = links[8:-8]

for a in endoflinks:
    dwnlink = "domain.com" + a
    r = requests.get(dwnlink)
    f = open("output.txt", "a")
    print(r.url, file=f)
    f.close()
This should help you get going:
URL = "domain.com/"
for i in list(range(0,10)):
print("domain.com/"+str(i))
r = requests.get(URL+str(i))
f = open("output.txt", "a")
print(r.url, file=f)
f.close()
domain.com/0
domain.com/1
domain.com/2
domain.com/3
domain.com/4
domain.com/5
domain.com/6
domain.com/7
domain.com/8
domain.com/9
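Putting the paging loop together with the link extraction from the question, a sketch along these lines should work; domain.com, the [8:-8] slice and the page count of 10 are placeholders/assumptions carried over from the snippets above:
from bs4 import BeautifulSoup
import requests

BASE = "domain.com/"  # placeholder domain from the question

with open("output.txt", "a") as f:
    for page_num in range(10):                 # assumed number of pages
        page = requests.get(BASE + str(page_num))
        soup = BeautifulSoup(page.content, "html.parser")
        links = [a['href'] for a in soup.find_all('a', href=True)]
        for href in links[8:-8]:               # same slice the question uses to skip nav links
            r = requests.get("domain.com" + href)
            print(r.url, file=f)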

BeautifulSoup with Table

I'm web scraping with Beautiful Soup and I am getting an error on line 13: for row in table.findAll('tr').
The error comes up on the cmd. Hope someone can help.
import csv
import requests
from bs4 import BeautifulSoup

url = 'http://www.dublincity.ie/dublintraffic/carparks.htm'
response = requests.get(url)
html = response.content
soup = BeautifulSoup(html)

table = soup.find('tbody', attrs={'id': 'itemsBody'})

list_of_rows = []
for row in table.findAll('tr'):
    list_of_cells = []
    for cell in row.findAll('td'):
        text = cell.text.replace(' ', '')
        list_of_cells.append(text)
    list_of_rows.append(list_of_cells)

outfile = open("./carpark.csv", "wb")
writer = csv.writer(outfile)
writer.writerow(["location", "spaces"])
writer.writerows(list_of_rows)
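For what it's worth, the error on line 13 most likely means soup.find('tbody', attrs={'id': 'itemsBody'}) returned None (the tbody is probably filled in by JavaScript rather than present in the downloaded HTML), so calling findAll on it raises AttributeError. A quick way to confirm:
table = soup.find('tbody', attrs={'id': 'itemsBody'})
if table is None:
    # nothing with id="itemsBody" exists in the raw page source,
    # so table.findAll('tr') would raise AttributeError on NoneType
    raise SystemExit("no #itemsBody table in the static HTML")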
If you want to stick to BeautifulSoup, you can fetch and write the content using its xml parser along with csv.DictWriter(). Check out the implementation:
import csv
import requests
from bs4 import BeautifulSoup

url = 'http://www.dublincity.ie/dublintraffic/cpdata.xml?1543254514266'
res = requests.get(url)
soup = BeautifulSoup(res.content, "xml")

data = []
for item in soup.select("carpark"):
    ditem = {}
    ditem['Name'] = item.get("name")
    ditem['Spaces'] = item.get("spaces")
    data.append(ditem)

with open("xmldocs.csv", "w", newline="") as f:
    writer = csv.DictWriter(f, ["Name", "Spaces"])
    writer.writeheader()
    for info in data:
        writer.writerow(info)
You could retrieve the data as an XML document and then parse it. This is just an example of part of the process, which you can tailor.
import requests
from xml.etree import ElementTree
import pandas as pd

url = 'http://www.dublincity.ie/dublintraffic/cpdata.xml?1543254514266'
xml_data = requests.get(url).content
tree = ElementTree.fromstring(xml_data)

parking = []
for child in tree:
    for nextChild in child:
        parking.append([child.tag, nextChild.attrib['name'], nextChild.attrib['spaces']])

df = pd.DataFrame(parking)
print(df)
df.to_csv(r'C:\Users\User\Desktop\Data.csv', sep=',', encoding='utf-8', index=False)
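If you want meaningful headers in the CSV, you could name the columns when building the frame; the names below are assumptions based on the attributes being read (the first element is the parent tag, i.e. the area the carpark sits in):
df = pd.DataFrame(parking, columns=['area', 'name', 'spaces'])
df.to_csv(r'C:\Users\User\Desktop\Data.csv', sep=',', encoding='utf-8', index=False)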

How do I create a list from a sitemap.xml file to extract the URLs in Python?

I need to create a script that extracts a word from a scrape of images.
To explain: starting from a sitemap.xml page, my code must visit every link present in that XML file and check, inside each linked page, whether a specific word appears in an image link.
The sitemap is the adidas one: http://www.adidas.it/on/demandware.static/-/Sites-adidas-IT-Library/it_IT/v/sitemap/product/adidas-IT-it-it-product.xml
This is the code I created to search for images whose src contains the word "zoom":
import requests
from bs4 import BeautifulSoup

html = requests.get(
    'http://www.adidas.it/scarpe-superstar/C77124.html').text
bs = BeautifulSoup(html)

possible_links = bs.find_all('img')
for link in possible_links:
    if link.has_attr('src'):
        if 'zoom' in link['src']:
            print link['src']
But I am looking for a method to scrape the whole list automatically. Thank you so much.
I tried this to get the list:
from bs4 import BeautifulSoup
import requests

url = "http://www.adidas.it/on/demandware.static/-/Sites-adidas-IT-Library/it_IT/v/sitemap/product/adidas-IT-it-it-product.xml"
r = requests.get(url)
data = r.text
soup = BeautifulSoup(data)

for url in soup.findAll("loc"):
    print url.text
But I can't connect the two, so that the word "zoom" is searched for in every link present in sitemap.xml. Thank you so much.
import requests
from bs4 import BeautifulSoup
import re

def make_soup(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'lxml')
    return soup

# put urls in a list
def get_xml_urls(soup):
    urls = [loc.string for loc in soup.find_all('loc')]
    return urls

# get the img urls
def get_src_contain_str(soup, string):
    srcs = [img['src'] for img in soup.find_all('img', src=re.compile(string))]
    return srcs

if __name__ == '__main__':
    xml = 'http://www.adidas.it/on/demandware.static/-/Sites-adidas-IT-Library/it_IT/v/sitemap/product/adidas-IT-it-it-product.xml'
    soup = make_soup(xml)
    urls = get_xml_urls(soup)
    # loop through the urls
    for url in urls:
        url_soup = make_soup(url)
        srcs = get_src_contain_str(url_soup, 'zoom')
        print(srcs)
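If you only care about the pages that actually contain a matching image, you could filter inside the same loop (a small variation on the sketch above):
for url in urls:
    url_soup = make_soup(url)
    srcs = get_src_contain_str(url_soup, 'zoom')
    if srcs:  # skip pages with no matching image
        print(url)
        print(srcs)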

Extract text from <li>

I extracted text from a blog, but in this format:
<li><b><a href="https://www.bloomberg.com/news/articles/article-id">Text</li>
I need to extract only text from it.
I tried this code:
from bs4 import BeautifulSoup
from urllib import request
import nltk, re, pprint

def getAllDoxyDonkeyPosts(url, links):
    raw = request.urlopen(url).read()
    raw = BeautifulSoup(raw, "lxml")
    for a in raw.findAll('a'):
        try:
            url = a['href']
            title = a['title']
            if title == "Older Posts":
                links.append(url)
                getAllDoxyDonkeyPosts(url, links)
        except:
            title = ""
    return

blogUrl = "http://doxydonkey.blogspot.in/"
links = []
getAllDoxyDonkeyPosts(blogUrl, links)

def getDoxyDonkeyText(url):
    raw = request.urlopen(url).read()
    raw = BeautifulSoup(raw, "lxml")
    mydivs = raw.findAll("div", {"class": 'post-body'})
    posts = []
    for div in mydivs:
        posts += div.findAll("li")
    return posts

doxyDonkeyPosts = []
for link in links:
    doxyDonkeyPosts += getDoxyDonkeyText(link)
doxyDonkeyPosts
from bs4 import BeautifulSoup
t = '<li><b>Google</b></li>'
b = BeautifulSoup(t, "html.parser")
print b.text #--> Google
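Applied to the markup from the question, the same idea works with either .text or get_text(); a small sketch (the commented line shows how it would plug into the question's doxyDonkeyPosts list):
from bs4 import BeautifulSoup

t = '<li><b><a href="https://www.bloomberg.com/news/articles/article-id">Text</a></b></li>'
li = BeautifulSoup(t, "html.parser").find("li")
print li.get_text()  # --> Text

# post_texts = [li.get_text() for li in doxyDonkeyPosts]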

Scraping AJAX loaded content with python?

So I have a function that is called when I click a button; it goes as below:
var min_news_id = "68feb985-1d08-4f5d-8855-cb35ae6c3e93-1";

function loadMoreNews(){
    $("#load-more-btn").hide();
    $("#load-more-gif").show();
    $.post("/en/ajax/more_news", {'category': '', 'news_offset': min_news_id}, function(data){
        data = JSON.parse(data);
        min_news_id = data.min_news_id || min_news_id;
        $(".card-stack").append(data.html);
    })
    .fail(function(){alert("Error : unable to load more news");})
    .always(function(){$("#load-more-btn").show(); $("#load-more-gif").hide();});
}
jQuery.scrollDepth();
Now I don't have much experience with JavaScript, but I assume it's returning some JSON data from some sort of API at "/en/ajax/more_news".
Is there a way I could directly call this API and get the JSON data from my Python script? If yes, how?
If not, how do I scrape the content that is being generated?
You need to post the news id that you see inside the script to https://www.inshorts.com/en/ajax/more_news; this is an example using requests:
from bs4 import BeautifulSoup
import requests
import re

# pattern to extract min_news_id
patt = re.compile(r'var min_news_id\s+=\s+"(.*?)"')

with requests.Session() as s:
    soup = BeautifulSoup(s.get("https://www.inshorts.com/en/read").content)
    new_id_scr = soup.find("script", text=re.compile(r"var\s+min_news_id"))
    print(new_id_scr.text)
    news_id = patt.search(new_id_scr.text).group(1)
    js = s.post("https://www.inshorts.com/en/ajax/more_news", data={"news_offset": news_id})
    print(js.json())
js.json() gives you all the html; you just have to access js.json()["html"].
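For example, to pull the article summaries out of that payload (a sketch continuing from the snippet above; the div[itemprop="articleBody"] selector is taken from the fuller script below):
payload = js.json()                                   # dict with "html" and "min_news_id"
cards = BeautifulSoup(payload["html"], "html.parser")
for body in cards.find_all("div", {"itemprop": "articleBody"}):
    print(body.get_text(strip=True))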
Here is a script that will automatically loop through all the pages on inshorts.com:
from bs4 import BeautifulSoup
from newspaper import Article
import requests
import sys
import re
import json

patt = re.compile(r'var min_news_id\s+=\s+"(.*?)"')
i = 0
while True:
    with requests.Session() as s:
        if i == 0:
            # first pass: read the seed news id from the script tag on the main page
            soup = BeautifulSoup(s.get("https://www.inshorts.com/en/read").content, "lxml")
            new_id_scr = soup.find("script", text=re.compile(r"var\s+min_news_id"))
            news_id = patt.search(new_id_scr.text).group(1)
        js = s.post("https://www.inshorts.com/en/ajax/more_news", data={"news_offset": news_id})
        jsonToPython = js.json()
        news_id = jsonToPython["min_news_id"]
        data = jsonToPython["html"]
        i += 1
        soup = BeautifulSoup(data, "lxml")
        for tag in soup.find_all("div", {"class": "news-card"}):
            main_text = tag.find("div", {"itemprop": "articleBody"})
            summ_text = main_text.text
            summ_text = summ_text.replace("\n", " ")
            result = tag.find("a", {"class": "source"})
            art_url = result.get('href')
            if 'www.youtube.com' in art_url:
                print("Nothing")
            else:
                art_url = art_url[:-1]
                # print("Hello", art_url)
                article = Article(art_url)
                article.download()
                if article.is_downloaded:
                    article.parse()
                    article_text = article.text
                    article_text = article_text.replace("\n", " ")
                    print(article_text + "\n")
                    print(summ_text + "\n")
It gives both the summary from inshorts.com and the complete story from the respective news channel.
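Note that the loop above has no exit condition. One possible stopping rule, assuming the endpoint eventually returns an empty html payload or stops advancing min_news_id (an assumption about the endpoint's behaviour), is to check the response before reassigning news_id inside the loop:
        new_id = jsonToPython["min_news_id"]
        if not jsonToPython["html"] or new_id == news_id:
            break  # nothing new came back, stop paging
        news_id = new_id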
