I'm trying to create a presentation in Python and display it on my website with the Flask framework. Creating the PowerPoint isn't too hard with the help of python-pptx, but I cannot get it to display on my website.
Python:
from pptx import Presentation

def make_presentation(topic):
    prs = Presentation()
    title_slide_layout = prs.slide_layouts[0]
    slide = prs.slides.add_slide(title_slide_layout)
    title = slide.shapes.title
    subtitle = slide.placeholders[1]
    title.text = f"{topic}"
    subtitle.text = "Test"
    prs.save('static/test.pptx')
HTML:
<iframe src="static/test.pptx" width="1000" height="600"></iframe>
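For context, browsers won't render a .pptx inside an iframe natively, so markup like the above usually just triggers a download. One common workaround is to point Microsoft's Office web viewer at a publicly reachable copy of the file; a minimal sketch, assuming a hypothetical route and a placeholder public URL:
from urllib.parse import quote

from flask import Flask, render_template_string

app = Flask(__name__)

@app.route("/slides/<topic>")
def slides(topic):
    make_presentation(topic)  # writes static/test.pptx, as in the function above
    # Hypothetical public URL of the generated file; the viewer must be able to fetch it
    src = "https://your-domain.example/static/test.pptx"
    viewer = "https://view.officeapps.live.com/op/embed.aspx?src=" + quote(src, safe="")
    return render_template_string(
        '<iframe src="{{ url }}" width="1000" height="600"></iframe>', url=viewer
    )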
I tried to extract the job title, company, location and description from an example job page. I managed to get the job title, company and location, since each of their spans has a class name. I'm struggling to get the job description because its spans have no class, and part of the description is also located in a list. I tried to extract the text using an absolute XPath, but it doesn't work.
from time import sleep

from parsel import Selector
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.service import Service
from selenium.webdriver.support.ui import WebDriverWait

import parameter  # local module holding email, password and siteQuery

driver = webdriver.Edge(service=Service('C:/Users/users/Downloads/edgedriver_win64/msedgedriver.exe'))

# accessing linkedin
driver.get('https://www.linkedin.com')

# login
username = driver.find_element(By.NAME, 'session_key')
username.send_keys(parameter.email)
password = driver.find_element(By.NAME, 'session_password')
password.send_keys(parameter.password)
submit = driver.find_element(By.CLASS_NAME, 'sign-in-form__submit-button')
submit.click()
sleep(2)

driver.get(parameter.siteQuery)
sleep(5)
wait = WebDriverWait(driver, 20)

# collect the href of every job card on the results page
links = driver.find_elements(By.XPATH, "//a[@class='disabled ember-view job-card-container__link']")
links = [link.get_attribute("href") for link in links]
sleep(1)

for link in links:
    driver.get(link)  # visit each job page one at a time
    sleep(5)
    # moreinfo = driver.find_element(By.CLASS_NAME, 'artdeco-card__action')
    # moreinfo.click()
    sel = Selector(text=driver.page_source)
    title = sel.xpath('//h1[@class="t-24 t-bold"]/text()').extract()
    company = sel.xpath('//span[@class="jobs-unified-top-card__company-name"]/text()').extract()
    location = sel.xpath('//span[@class="jobs-unified-top-card__bullet"]/text()').extract()
    description = sel.xpath('/html/body/div[6]/div[3]/div/div[1]/div[1]/div/div[2]/article/div/div[1]/span/text()').extract()
I also tried extracting straight from the div, which didn't work either:
description = sel.xpath('//*[@id="jobs-details"]/span/text()').extract()
I'm using parsel.Selector and Selenium.
Any idea how to get the whole description?
Thanks in advance.
I figured it out after realizing I can just extract the whole HTML element, so I take the entire span and clean it up later.
Extracting from the span:
listDescriptions = []  # initialized once, before the scraping loop
descriptions = sel.xpath('//*[@id="job-details"]').extract()
listDescriptions.append(descriptions[0])
Cleaning the data:
from bs4 import BeautifulSoup
import re

listDescriptions2 = []
for description in listDescriptions:
    description = BeautifulSoup(description, features='html.parser').text
    description = re.sub(r'\n', '', description)  # drop newlines left over from the markup
    listDescriptions2.append(description)
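For what it's worth, the same cleanup can be done in one pass with BeautifulSoup's get_text, which also joins the list items with a separator; a small sketch:
from bs4 import BeautifulSoup

listDescriptions2 = [
    BeautifulSoup(d, features='html.parser').get_text(separator=' ', strip=True)
    for d in listDescriptions
]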
I'm trying to get data from the webpage https://bitinfocharts.com/comparison/price-btc.html and I have the code:
doc = SimplifiedDoc(html)
js = doc.getElementByText('new Dygraph', tag='script').html
js = js[js.find('document.getElementById("container"),') + len('document.getElementById("container"),'):]
js = js[:js.find(', {labels:')] # Get data part
js = js.replace('[new Date("', '').replace('")', '')[1:-2]
data = [kv.split(',') for kv in js.split('],')]
which works for other pages on the same website, but on the price page it raises an AttributeError: 'NoneType' object has no attribute 'find'. The code for this page seems to be the same as for the rest, so I don't know why this one returns an error. For example, on https://bitinfocharts.com/comparison/transactions-btc.html it works perfectly as intended.
It's the getElementByText call that goes wrong; I don't know why. You can use the following lines instead.
from simplified_scrapy import SimplifiedDoc, utils, req
html = req.get('https://bitinfocharts.com/comparison/bitcoin-price.html')
doc = SimplifiedDoc(html)
# js = doc.getElementByText('new Dygraph', tag='script').html
js = doc.selects('script').contains('new Dygraph')[0].html # Change to this
js = js[js.find('document.getElementById("container"),') +
len('document.getElementById("container"),'):]
js = js[:js.find(', {labels:')] # Get data part
js = js.replace('[new Date("', '').replace('")', '')[1:-2]
data = [kv.split(',') for kv in js.split('],')]
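As a follow-up, each entry in data ends up as a pair of strings like ['2010/07/18', '0.0858']; a short sketch for turning them into typed rows, assuming the YYYY/MM/DD date format and the occasional 'null' gaps these charts contain:
from datetime import datetime

rows = []
for day, value in data:
    rows.append((
        datetime.strptime(day, '%Y/%m/%d').date(),
        None if value == 'null' else float(value),  # 'null' marks days without data
    ))
print(rows[:3])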
I am scraping the payment methods of a website, but all the payment methods are added with the help of CSS. I don't know how to scrape that. I tried to find something on Stack Overflow but was unable to find any helpful material.
The payment methods are given at the end of the page, on the bottom left side.
payment_method = soup.find("div", class_="footer-second")
payment_method = payment_method.find("div", class_="drz-footer-width-25 payment-column")
payment_method = payment_method.find_all("span")
This is the code I used, but I have no idea how to scrape the icon images or image links, so I am unable to code further. There is no href or src link in the tag; only a CSS class is used to show the icon on the page.
It comes from an image URL. You would need to regex out the image URL from the relevant source CSS file and then try some form of optical recognition software. The following gets you the URL.
import requests, re
r = requests.get('https://laz-g-cdn.alicdn.com/lzdmod/desktop-footer-daraz/5.2.38/??pc/index.css')
p = re.compile(r'icon-yatra-v-pk{.*\.drz-footer-sprit{background-image:url\((.*?)\);')
image_url = 'https:' + p.findall(r.text)[0]
print(image_url)
The CSS instructions place parts of this image "in frame" via the class attributes and CSS styling instructions. For example, inspect an element with the class .icon-yatra-payment-8 in the browser and examine the CSS instructions for that node; you will see background-position, width and height specified, with the node displayed as inline-block over the background image. You will also see links to the source CSS files for these instructions.
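To illustrate how those instructions map onto the sprite, a sketch like the following could crop one icon out of the downloaded image with Pillow; the pixel values are hypothetical placeholders you would read from the node's background-position, width and height:
from io import BytesIO

import requests
from PIL import Image

sprite = Image.open(BytesIO(requests.get(image_url).content))

# Hypothetical values from a rule such as:
#   background-position: -10px -120px; width: 40px; height: 25px;
x, y, w, h = 10, 120, 40, 25  # negate the background-position offsets

icon = sprite.crop((x, y, x + w, y + h))
icon.save('icon-yatra-payment-8.png')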
I'm trying to download some images from TripAdvisor using urllib, but all I get for the URL in the src field of the HTML is this.
I've done some research and found out that these are lazy-loaded images... Is there any way to download them?
You can extract a list of images from the JavaScript using the BeautifulSoup and json modules, then iterate over the list and retrieve the images you are interested in.
EDIT:
The problem was that the images all have the same name, so they got overwritten. Fetching the first three images is trivial, but references to the other images in the carousel are not loaded until the carousel is opened, so that's trickier. For some images you can find a higher-resolution version by substituting "photo-s" in the path with "photo-w", but figuring out which ones requires diving deeper into the JavaScript logic.
import json
import re
import urllib.request  # Python 3; the original used the Python 2 urllib API
from bs4 import BeautifulSoup as bs

def img_data_filter(tag):
    # Match the <script> element that defines the lazy-loaded image list
    if tag.name == "script" and tag.text.strip().startswith("var lazyImgs"):
        return True
    return False

response = urllib.request.urlopen("https://www.tripadvisor.it/Restaurant_Review-g3174493-d3164947-Reviews-Le_Ciaspole-Tret_Fondo_Province_of_Trento_Trentino_Alto_Adige.html")
soup = bs(response.read(), 'html.parser')
img_data = soup.find(img_data_filter)

# Strip the JavaScript down to the bare JSON array
js = img_data.text
js = js.replace("var lazyImgs = ", '')
js = re.sub(r";\s+var lazyHtml.+", '', js, flags=re.DOTALL)
imgs = json.loads(js)

suffix = 1
for img in imgs:
    img_url = img["data"]
    if "media/photo-s" not in img_url:
        continue
    # The images share a file name, so append a counter to avoid overwriting
    img_name = img_url[img_url.rfind('/')+1:-4]
    img_name = "%s-%03d.jpg" % (img_name, suffix)
    suffix += 1
    urllib.request.urlretrieve(img_url, img_name)
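If you want to try the higher-resolution "photo-w" variant mentioned above, a one-line substitution inside the loop is enough; this is only a sketch, since not every image has a larger version and the request may 404:
import urllib.error

hi_res_url = img_url.replace("/photo-s/", "/photo-w/")
try:
    urllib.request.urlretrieve(hi_res_url, "w-" + img_name)
except urllib.error.HTTPError:
    pass  # no higher-resolution version of this image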
The game plan is to extract those main images and display them as thumbnails on the index page. I'm having a lot of trouble with this functionality; there seems to be no example for it on the internet.
I found three options:
1. BeautifulSoup // people seem to use this approach the most, but I have no idea how BeautifulSoup can find the representative image... it also requires the most work, I think.
2. python-goose // this looks legit. The documentation says it extracts the main image, so I guess I need to trust their word. The problem is I don't know how to use it in Django.
3. embedly // ...maybe the wrong choice for the functionality I need. I'm thinking of using python-goose for this project.
My question is: how would you approach this problem? Do you know of any example, or can you provide one, that I can look at? For extracting an image from the images users upload to my page I can probably use sorl-thumbnail (right?), but for a posted link...?
Edit 1: Using python-goose, it seems (main) image scraping is very simple. The problem is I'm not sure how to use the script in my app: how should I turn that image into the right thumbnail and display it on my index.html?
Here is my media.py (not sure if it works yet):
import json

from goose import Goose

def extract(request):
    # Django request: the Flask original used request.args.get('url')
    url = request.GET.get('url')
    g = Goose()
    article = g.extract(url=url)
    response = {'image': article.top_image.src}
    return json.dumps(response)
source: https://blog.openshift.com/day-16-goose-extractor-an-article-extractor-that-just-works/
The blog example uses Flask; I tried to adapt the script for people using Django.
Edit 2: OK, here is my approach. I really think this is right, but unfortunately it doesn't give me anything: no error, no image, even though the Python syntax is right... If anyone knows why it's not working, please let me know.
Models.py
class Post(models.Model):
    url = models.URLField(max_length=250, blank=True, null=True)

    def extract(request, url):
        url = requests.POST.get('url')
        g = Goose()
        article = g.extract(url=url)
        response = {'image': article.top_image.src}
        return json.dumps(response)
Index.html
{% if posts %}
    {% for post in posts %}
        {{ post.extract }}
    {% endfor %}
{% endif %}
BeautifulSoup would be the way to go for this, and it's actually remarkably easy.
To begin, an image in HTML looks like this:
<img src="http://www.url.to/image.png">
We can use BeautifulSoup to extract all img tags and then read each tag's src attribute. This is achieved as shown below.
from bs4 import BeautifulSoup  # Import stuff
import requests

r = requests.get("http://www.site-to-extract.com/")  # Download website source
data = r.text  # Get the website source as text
soup = BeautifulSoup(data, 'html.parser')  # Set up a "soup" which BeautifulSoup can search

links = []
for link in soup.find_all('img'):  # Cycle through all 'img' tags
    imgSrc = link.get('src')  # Extract the 'src' from those tags
    links.append(imgSrc)  # Append the source to 'links'
print(links)  # Print 'links'
I don't know how you plan on deciding which image to use as the thumbnail, but you can then go through the list of URLs and pick the one you want.
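For example, a tiny hedged heuristic that skips data: URIs and relative paths and takes the first absolute image URL might look like this:
# A hypothetical pick: the first absolute http(s) image URL in the list
thumbnail = next(
    (u for u in links if u and u.startswith(("http://", "https://"))),
    None,
)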
Update
I know you said Django, but I would highly recommend Flask. It's a lot simpler, yet still very functional.
I wrote this, which simply displays the 1st image of whatever webpage you give it.
from bs4 import BeautifulSoup  # Import stuff
import requests
from flask import Flask

app = Flask(__name__)

def getImages(url):
    r = requests.get(url)  # Download website source
    data = r.text  # Get the website source as text
    soup = BeautifulSoup(data, 'html.parser')  # Set up a "soup" which BeautifulSoup can search
    links = []
    for link in soup.find_all('img'):  # Cycle through all 'img' tags
        imgSrc = link.get('src')  # Extract the 'src' from those tags
        links.append(imgSrc)  # Append the source to 'links'
    return links  # Return 'links'

@app.route('/<site>')
def page(site):
    image = getImages("http://" + site)[0]  # Here I find the 1st image on the page
    if image[0] == "/":
        image = "http://" + site + image  # This creates a URL for the image
    return '<img src="%s">' % image  # Return the image in an HTML img tag

if __name__ == '__main__':
    app.run(debug=True, host="0.0.0.0")  # Run the Flask webserver
This hosts a web server on http://localhost:5000/.
To input a site, append it to the URL: http://localhost:5000/yoursitehere, for example http://localhost:5000/www.google.com