Python Web Scraper IEEE - python

I am trying to retrieve keywords of a particular IEEE document. I came across this code here
ieee_content = requests.get(link, timeout=180)
soup = BeautifulSoup(ieee_content.text, 'lxml')
tag = soup.find_all('script')
#metadata = "".join(re.findall('global.document.metadata=(.*)', tag[9].text)).replace(";", '').replace('global.document.metadata=', '')
for i in tag[9]:
metadata_format = re.compile(r'global.document.metadata=.*', re.MULTILINE)
metadata = re.findall(metadata_format, i)
if len(metadata) != 0:
# convert the list
convert_to_json = json.dumps(metadata)
x = json.loads(convert_to_json)
s = x[0].replace("'", '"').replace(";", '')
The problem is that my metadata variable is always empty. I tried to iterate across all tags rather than using tag[9], but metadata is still empty in all cases. I tried using 'xml' instead of 'lmxl' as well but the result is the same. I'd appreciate some help with this.

import json
import re
from pprint import pprint
import requests
from bs4 import BeautifulSoup
ieee_content = requests.get("https://ieeexplore.ieee.org/document/7845555", timeout=180)
soup = BeautifulSoup(ieee_content.content, "html.parser")
scripts = soup.find_all("script")
pattern = re.compile(r"(?<=\"keywords\":)\[{.*?}\]")
keywords_dict = {}
for i, script in enumerate(scripts):
keywords = re.findall(pattern, str(script.string))
if len(keywords) == 1:
raw_keywords_list = json.loads(keywords[0])
for keyword_type in raw_keywords_list:
keywords_dict[keyword_type["type"].strip()] = [kwd.strip() for kwd in keyword_type["kwd"]]
pprint(keywords_dict)

Related

how to use python to parse a html that is in txt format?

I am trying to parse a txt, example as below link.
The txt, however, is in the form of html. I am trying to get "COMPANY CONFORMED NAME" which located at the top of the file, and my function should return "Monocle Acquisition Corp".
https://www.sec.gov/Archives/edgar/data/1754170/0001571049-19-000004.txt
I have tried below:
import requests
from bs4 import BeautifulSoup
url = 'https://www.sec.gov/Archives/edgar/data/1754170/0001571049-19-000004.txt'
r = requests.get(url)
soup = BeautifulSoup(r.content, "html")
However, "soup" does not contain "COMPANY CONFORMED NAME" at all.
Can someone point me in the right direction?
The data you are looking for is not in an HTML structure so Beautiful Soup is not the best tool. The correct and fast way of searching for this data is just using a simple Regular Expression like this:
import re
import requests
url = 'https://www.sec.gov/Archives/edgar/data/1754170/0001571049-19-000004.txt'
r = requests.get(url)
text_string = r.content.decode()
name_re = re.compile("COMPANY CONFORMED NAME:[\\t]*(.+)\n")
match = name_re.search(text_string).group(1)
print(match)
the part you look like is inside a huge tag <SEC-HEADER>
you can get the whole section by using soup.find('sec-header')
but you will need to parse the section manually, something like this works, but it's some dirty job :
(view it in replit : https://repl.it/#gui3/stackoverflow-parsing-html)
import requests
from bs4 import BeautifulSoup
url = 'https://www.sec.gov/Archives/edgar/data/1754170/0001571049-19-000004.txt'
r = requests.get(url)
soup = BeautifulSoup(r.content, "html")
header = soup.find('sec-header').text
company_name = None
for line in header.split('\n'):
split = line.split(':')
if len(split) > 1 :
key = split[0]
value = split[1]
if key.strip() == 'COMPANY CONFORMED NAME':
company_name = value.strip()
break
print(company_name)
There may be some library able to parse this data better than this code

Can't parse different product links from a webpage

I've created a script in Python to fetch different product links from a webpage. Although I know the content of that site are dynamic, I tried conventional way to let you inform that I tried. I looked for APIs in the dev tools but could not find one. Ain't there any way to get those links using requests?
Site Link
I've written so far:
import requests
from bs4 import BeautifulSoup
link = "https://www.amazon.com/stores/node/10699640011"
def fetch_product_links(url):
res = requests.get(url,headers={"User-Agent":"Mozilla/5.0"})
soup = BeautifulSoup(res.text,"lxml")
for item_link in soup.select("[id^='ProductGrid-'] li[class^='style__itemOuter__'] > a"):
print(item_link.get("href"))
if __name__ == '__main__':
fetch_product_links(link)
How can I fetch different product links from that site using requests?
I think you only need the asins which you can collect from another url construct you can see in network tab i.e. you can significantly shorten the final urls. You do however need to make a request to your original url to pick up an identifier to use in second url. Returns 146 links.
import requests, re, json
node = '10699640011'
with requests.Session() as s:
r = s.get(f'https://www.amazon.com/stores/node/{node}')
p = re.compile(r'var slotsStr = "\[(.*?,){3} share\]";')
identifier = p.findall(r.text)[0]
identifier = identifier.strip()[:-1]
r = s.get(f'https://www.amazon.com/stores/slot/{identifier}?node={node}')
p = re.compile(r'var config = (.*?);')
data = json.loads(p.findall(r.text)[0])
asins = data['content']['ASINList']
links = [f'https://www.amazon.com/dp/{asin}' for asin in asins]
print(links)
EDIT:
With two given nodes:
import requests, re, json
from bs4 import BeautifulSoup as bs
nodes = ['3039806011','10699640011']
with requests.Session() as s:
for node in nodes:
r = s.get(f'https://www.amazon.com/stores/node/{node}')
soup = bs(r.content, 'lxml')
identifier = soup.select('.stores-widget-btf:not([id=share],[id*=RECOMMENDATION])')[-1]['id']
r = s.get(f'https://www.amazon.com/stores/slot/{identifier}?node={node}')
p = re.compile(r'var config = (.*?);')
data = json.loads(p.findall(r.text)[0])
asins = data['content']['ASINList']
links = [f'https://www.amazon.com/dp/{asin}' for asin in asins]
print(links)

Cannot get a specific href out of requests

I'm trying to capture a unique url using Pythons Requests
Source website is https://www.realestate.com.au/property/1-10-grosvenor-rd-terrigal-nsw-2260
Goal Url is http://www.realestate.com.au/sold/property-unit-nsw-terrigal-124570934
When i tried
(Unique_ID,) = (x.text_content() for x in tree.xpath('//a[#class="property-
value__link--muted rui-button-brand property-value__btn-listing"]'))
The CSV returned View Listing
Unless im mistaken, i've done the correct class search, as the href would not be unique enough? Am i supposed to do something different to capture URL's instead of text?
Full code below if required.
Thanks in advance.
import requests
import csv
import datetime
import pandas as pd
import csv
from lxml import html
df = pd.read_excel("C:\Python27\Projects\REA_UNIQUE_ID\\UN.xlsx", sheetname="UN")
dnc = df['Property']
dnc_list = list(dnc)
url_base = "https://www.realestate.com.au/property/"
URL_LIST = []
for nd in dnc_list:
nd = nd.strip()
nd = nd.lower()
nd = nd.replace(" ", "-")
URL_LIST.append(url_base + nd)
text2search = '''The information provided'''
with open('Auctions.csv', 'wb') as csv_file:
writer = csv.writer(csv_file)
for index, url in enumerate(URL_LIST):
page = requests.get(url)
print '\r' 'Scraping URL ' + str(index+1) + ' of ' + str(len(URL_LIST)),
if text2search in page.text:
tree = html.fromstring(page.content)
(title,) = (x.text_content() for x in tree.xpath('//title'))
(Unique_ID,) = (x.text_content() for x in tree.xpath('//a[#class="property-value__link--muted rui-button-brand property- value__btn-listing"]'))
#(sold,) = (x.text_content().strip() for x in tree.xpath('//p[#class="property-value__agent"]'))
writer.writerow([title, Unique_ID])
text_content() allows you to get text only. Try to scrape #href as below
(Unique_ID,) = (x for x in tree.xpath('//a[#class="property-value__link--muted rui-button-brand property-value__btn-listing"]/#href'))

Scraping AJAX loaded content with python?

So i have function that is called when i click a button , it goes as below
var min_news_id = "68feb985-1d08-4f5d-8855-cb35ae6c3e93-1";
function loadMoreNews(){
$("#load-more-btn").hide();
$("#load-more-gif").show();
$.post("/en/ajax/more_news",{'category':'','news_offset':min_news_id},function(data){
data = JSON.parse(data);
min_news_id = data.min_news_id||min_news_id;
$(".card-stack").append(data.html);
})
.fail(function(){alert("Error : unable to load more news");})
.always(function(){$("#load-more-btn").show();$("#load-more-gif").hide();});
}
jQuery.scrollDepth();
Now i don't have much experience with javascript , but i assume its returning some json data from some sort of api at "en/ajax/more_news" .
Is there i way could directly call this api and get the json data from my python script. If Yes,how?
If not how do i scrape the content that is being generated?
You need to post the news id that you see inside the script to https://www.inshorts.com/en/ajax/more_news, this is an example using requests:
from bs4 import BeautifulSoup
import requests
import re
# pattern to extract min_news_id
patt = re.compile('var min_news_id\s+=\s+"(.*?)"')
with requests.Session() as s:
soup = BeautifulSoup(s.get("https://www.inshorts.com/en/read").content)
new_id_scr = soup.find("script", text=re.compile("var\s+min_news_id"))
print(new_id_scr.text)
news_id = patt.search(new_id_scr.text).group()
js = s.post("https://www.inshorts.com/en/ajax/more_news", data={"news_offset":news_id})
print(js.json())
js gives you all the html, you just have to access the js["html"].
Here is the script that will automatically loop through all the pages in inshort.com
from bs4 import BeautifulSoup
from newspaper import Article
import requests
import sys
import re
import json
patt = re.compile('var min_news_id\s+=\s+"(.*?)"')
i = 0
while(1):
with requests.Session() as s:
if(i==0):soup = BeautifulSoup(s.get("https://www.inshorts.com/en/read").content,"lxml")
new_id_scr = soup.find("script", text=re.compile("var\s+min_news_id"))
news_id = patt.search(new_id_scr.text).group(1)
js = s.post("https://www.inshorts.com/en/ajax/more_news", data={"news_offset":news_id})
jsn = json.dumps(js.json())
jsonToPython = json.loads(jsn)
news_id = jsonToPython["min_news_id"]
data = jsonToPython["html"]
i += 1
soup = BeautifulSoup(data, "lxml")
for tag in soup.find_all("div", {"class":"news-card"}):
main_text = tag.find("div", {"itemprop":"articleBody"})
summ_text = main_text.text
summ_text = summ_text.replace("\n", " ")
result = tag.find("a", {"class":"source"})
art_url = result.get('href')
if 'www.youtube.com' in art_url:
print("Nothing")
else:
art_url = art_url[:-1]
#print("Hello", art_url)
article = Article(art_url)
article.download()
if article.is_downloaded:
article.parse()
article_text = article.text
article_text = article_text.replace("\n", " ")
print(article_text+"\n")
print(summ_text+"\n")
It gives both the summary from inshort.com and complete news from respective news channel.

Get list of all paginated URL's from links in txt file in python requests

Hi Guys Define a Function to Get list of all paginated URLs at bottom from links in txt file in python.
Here is an example of what i need done.
Input link
http://www.apartmentguide.com/apartments/Alabama/Hartselle/
Desired Output
www.apartmentguide.com/apartments/Alabama/Hartselle/?page=2
www.apartmentguide.com/apartments/Alabama/Hartselle/?page=3
www.apartmentguide.com/apartments/Alabama/Hartselle/?page=4
www.apartmentguide.com/apartments/Alabama/Hartselle/?page=5
www.apartmentguide.com/apartments/Alabama/Hartselle/?page=6
www.apartmentguide.com/apartments/Alabama/Hartselle/?page=7
www.apartmentguide.com/apartments/Alabama/Hartselle/?page=8
www.apartmentguide.com/apartments/Alabama/Hartselle/?page=9
so on to any limit each Input Url have.
This is the function i written so far but its not working i am not good with Python either .
import requests
#from bs4 import BeautifulSoup
from scrapy import Selector as Se
import urllib2
lists = open("C:\Users\Administrator\Desktop\\3.txt","r")
read_list = lists.read()
line = read_list.split("\n")
def get_links(line):
for each in line:
r = requests.get(each)
sel = Se(text=r.text, type="html")
next_ = sel.xpath('//a[#class="next sprite"]//#href').extract()
for next_1 in next_:
next_2 = "http://www.apartmentguide.com"+next_1
print next_2
get_links(next_1)
get_links(line)
Below are two ways to do this.
import mechanize
import requests
from bs4 import BeautifulSoup, SoupStrainer
import urlparse
import pprint
#-- Mechanize --
br = mechanize.Browser()
def get_links_mechanize(root):
links = []
br.open(root)
for link in br.links():
try:
if dict(link.attrs)['class'] == 'page':
links.append(link.absolute_url)
except:
pass
return links
#-- Requests / BeautifulSoup / urlparse --
def get_links_bs(root):
links = []
r = requests.get(root)
for link in BeautifulSoup(r.text, parse_only=SoupStrainer('a')):
if link.has_attr('href') and link.has_attr('class') and 'page' in link.get('class'):
links.append(urlparse.urljoin(root, link.get('href')))
return links
#with open("C:\Users\Administrator\Desktop\\3.txt","r") as f:
# for root in f:
# links = get_links(root)
# # <Do something with links>
root = 'http://www.apartmentguide.com/apartments/Alabama/Hartselle/'
print "Mech:"
pprint.pprint( get_links_mechanize(root) )
print "Requests/BS4/urlparse:"
pprint.pprint( get_links_bs(root) )
One uses mechanize -- it's a bit smarter with URLs but it's a lot slower and may be overkill depending on what else you're doing.
The other uses requests to fetch the page (urllib2 would suffice), BeautifulSoup to parse the markup and urlparse to form absolute URLs from the relative URLs in the page you listed.
Note that both of these functions return the following list:
['http://www.apartmentguide.com/apartments/Alabama/Hartselle/?page=2',
'http://www.apartmentguide.com/apartments/Alabama/Hartselle/?page=3',
'http://www.apartmentguide.com/apartments/Alabama/Hartselle/?page=4',
'http://www.apartmentguide.com/apartments/Alabama/Hartselle/?page=5',
'http://www.apartmentguide.com/apartments/Alabama/Hartselle/?page=2',
'http://www.apartmentguide.com/apartments/Alabama/Hartselle/?page=3',
'http://www.apartmentguide.com/apartments/Alabama/Hartselle/?page=4',
'http://www.apartmentguide.com/apartments/Alabama/Hartselle/?page=5']
which has duplicates. You can get rid of the duplicates by changing
return links
to
return list(set(links))
for whatever method you choose.
EDIT:
I noticed that the above functions only returned the links to pages 2-5, and you'd have to navigate those pages to see that there were in fact 10 pages.
A completely different approach would be to scrape the "root" page for number of results, then predict how many pages that would result in, then build links from that.
Since there are 20 results per page, figuring out how many pages is straightforward, consider:
import requests, re, math, pprint
def scrape_results(root):
links = []
r = requests.get(root)
mat = re.search(r'We have (\d+) apartments for rent', r.text)
num_results = int(mat.group(1)) # 182 at the moment
num_pages = int(math.ceil(num_results/20.0)) # ceil(182/20) => 10
# Construct links for pages 1-10
for i in range(num_pages):
links.append("%s?page=%d" % (root, (i+1)))
return links
pprint.pprint(scrape_results(root))
This will be the fastest method of the 3, but possibly more error prone.
EDIT 2:
Maybe something like:
import re, math, pprint
import requests, urlparse
from bs4 import BeautifulSoup, SoupStrainer
def get_pages(root):
links = []
r = requests.get(root)
mat = re.search(r'We have (\d+) apartments for rent', r.text)
num_results = int(mat.group(1)) # 182 at the moment
num_pages = int(math.ceil(num_results/20.0)) # ceil(182/20) => 10
# Construct links for pages 1-10
for i in range(num_pages):
links.append("%s?page=%d" % (root, (i+1)))
return links
def get_listings(page):
links = []
r = requests.get(page)
for link in BeautifulSoup(r.text, parse_only=SoupStrainer('a')):
if link.has_attr('href') and link.has_attr('data-listingid') and 'name' in link.get('class'):
links.append(urlparse.urljoin(root, link.get('href')))
return links
root='http://www.apartmentguide.com/apartments/Alabama/Hartselle/'
listings = []
for page in get_pages(root):
listings += get_listings(page)
pprint.pprint(listings)
print(len(listings))
With Re i was unsure ,so tried xpath.
links = open("C:\Users\ssamant\Desktop\Anida\Phase_II\Apartmentfinder\\2.txt","r")
read_list = links.read()
line = read_list.split("\n")
for each in line:
lines = []
r = requests.get(each)
sel = Selector(text=r.text,type="html")
mat = sel.xpath('//h1//strong/text()').extract()
mat = str(mat)
mat1 = mat.replace(" apartments for rent']","")
mat2 = mat1.replace("[u'","")
mat3 = int(mat2)
num_pages = int(math.ceil(mat3/20.0))
for i in range(num_pages):
lines.append("%s/Page%d" % (each, (i+1)))
with open('C:\Users\ssamant\Desktop\Anida\Phase_II\Apartmentfinder\\test.csv', 'ab') as f:
writer = csv.writer(f)
for val in lines:
writer.writerow([val])

Categories