Data not being written to file - python

I am writing a Python program that uses a Scrapy spider to crawl a website and look for PDF files. Once a PDF file is found, it downloads the PDF to a folder, then opens and reads it to grab some info (author, title, number of pages, and whether the document is tagged or not). This data is written to a CSV file.
Everything seems to work fine, but when I open the CSV file there is not nearly as much data in it as I expect. For example, I will crawl a site and find 40 PDF files, but the CSV has entries for only 14 PDFs.
pdfspider.py
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.http import Request
import urllib.parse as urlparse
import os.path
import validators
import csv
from .. info import isTagged
from .. get_metadata import get_data


class PdfspiderSpider(CrawlSpider):
    name = 'pdfspider'
    allowed_domain = input('Enter the domain name of the website to be crawled (domain of https://google.com is "google"): ')
    allowed_domains = [allowed_domain]
    # need domain to name folder pdfs will be put into
    global domain
    domain = allowed_domains[0]
    global start
    start = input('Enter the url of the page you wish to start the crawl on (include http/https): ')
    start_urls = [start]
    global base_path
    base_path = input('Where do you wish to save the folder containing the pdfs?: ')

    rules = (
        Rule(LinkExtractor(), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        base_url = start
        for a in response.xpath('//a[@href]/@href'):
            link = a.extract()
            if link.endswith('.pdf'):
                link = urlparse.urljoin(base_url, link)
                yield Request(link, callback=self.save_pdf)

    def create_csv(self):
        header = ['location', 'title', 'author', '# of pages', 'tagged?']
        filename = base_path + '/' + domain + '/' + domain + '.csv'
        f = open(filename, 'x')
        writer = csv.writer(f)
        writer.writerow(header)
        f.close()

    def save_pdf(self, response):
        url = response.url
        if response.status == 200:
            save_dir = base_path + '/' + domain
            isExist = os.path.exists(save_dir)
            if not isExist:
                # Create a new directory because it does not exist
                os.makedirs(save_dir)
            csvFile = domain + '.csv'
            csvPath = save_dir + '/' + csvFile
            csvPathExist = os.path.exists(csvPath)
            if not csvPathExist:
                self.create_csv()
            file = response.url.split('/')[-1]
            full_path = os.path.join(save_dir, file)
            with open(full_path, 'wb') as f:
                f.write(response.body)
            metaData = get_data(full_path)
            is_tagged = isTagged(full_path)
            row = [url, metaData[0], metaData[1], metaData[2], is_tagged]
            self.add_to_csv(row)
        else:
            print(f"Failed to load pdf: {url}")

    def add_to_csv(self, row):
        filename = base_path + '/' + domain + '/' + domain + '.csv'
        f = open(filename, "a")
        writer = csv.writer(f)
        writer.writerow(row)
        f.close()
info.py
from pdfminer3.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer3.pdfdevice import TagExtractor
from pdfminer3.pdfpage import PDFPage
from io import BytesIO


def isTagged(path, password=''):
    rsrcmgr = PDFResourceManager()
    retstr = BytesIO()
    try:
        try:
            device = TagExtractor(rsrcmgr, retstr, codec='utf-8')
        except:
            print('Not utf-8.')
            try:
                device = TagExtractor(rsrcmgr, retstr, codec='ascii')
            except:
                print('Not ascii.')
    except Exception as ex:
        print(ex)
    fp = open(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    maxpages = 100
    caching = True
    pagenos = set()
    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password, caching=caching, check_extractable=True):
        interpreter.process_page(page)
    contents = retstr.getvalue().decode()
    fp.close()
    device.close()
    retstr.close()
    # check if common proprietary Acrobat tags are in the response
    tags = ["<b'Part'", "</b'Sect'", "</b'Art'", "<b'Content'", "<b'Artifact'"]
    for tag in tags:
        if tag in contents:
            return 'tagged'
        else:
            continue
    return 'not tagged'
get_metadata.py
from pdfminer3.pdfparser import PDFParser
from pdfminer3.pdfdocument import PDFDocument
from pdfminer.high_level import extract_pages


def get_data(file):
    fp = open(file, 'rb')
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    author = doc.info[0]["Author"]
    title = doc.info[0]["Title"]
    numPages = len(list(extract_pages(file)))
    return title, author, numPages
Can anyone see what I am doing wrong? This is my first Python program, so I am pretty green. I am running Python 3.9.7.
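One place rows could be silently disappearing is get_data: doc.info[0]["Author"] raises a KeyError when a PDF has no Author or Title entry, and an exception inside save_pdf means that PDF never gets a row in the CSV. A hedged sketch of a more defensive version (the "unknown" fallback strings are my own choice, not from the original code):

import_note = """
from pdfminer3.pdfparser import PDFParser
from pdfminer3.pdfdocument import PDFDocument
from pdfminer.high_level import extract_pages


def get_data(file):
    with open(file, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        # guard against PDFs with no info dictionary at all
        info = doc.info[0] if doc.info else {}
        # .get() avoids a KeyError when a PDF has no Author/Title entry
        author = info.get("Author", "unknown")
        title = info.get("Title", "unknown")
    numPages = len(list(extract_pages(file)))
    return title, author, numPages
"""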

Related

Python only reads the first PDF file

I am using the following code for testing purposes. However, it only reads the first PDF in the directory. My PDF files are named test, test1 and test2, but it only reads test. I can also see the list of all the PDFs in the directory, but the read function only reads the first one. I am not sure about my line of code with +file_name; should it be like that or something else?
I am asking for help, please. Below is my code for reference:
#date
from datetime import*
import PyPDF2
import os
import re

today_date = datetime.today()
print('Today is:' , today_date)

#file list
for file_name in os.listdir(r"C:\\Nikhar\Work\Quantum\Work"):
    print(file_name)

#read all file in directory
load_pdf = open(r"C:\\Nikhar\\Work\\Quantum\\Work\\"+file_name, "rb")
read_pdf = PyPDF2.PdfFileReader(load_pdf)
page_count = read_pdf.getNumPages()
first_page = read_pdf.getPage(0)
page_content = first_page.extractText()
page_content = page_content.replace('\n', '')
print(page_content)
You have to indent the code so that it is executed in each iteration of the loop, like this:
#date
from datetime import*
import PyPDF2
import os
import re

today_date = datetime.today()
print('Today is:' , today_date)

#file list
for file_name in os.listdir(r"C:\\Nikhar\Work\Quantum\Work"):
    print(file_name)
    #read all file in directory
    load_pdf = open(r"C:\\Nikhar\\Work\\Quantum\\Work\\"+file_name, "rb")
    read_pdf = PyPDF2.PdfFileReader(load_pdf)
    page_count = read_pdf.getNumPages()
    first_page = read_pdf.getPage(0)
    page_content = first_page.extractText()
    page_content = page_content.replace('\n', '')
    print(page_content)
You must simply indent the code that should be executed in the for loop:
#date
from datetime import*
import PyPDF2
import os
import re

today_date = datetime.today()
print('Today is:' , today_date)

#file list
for file_name in os.listdir(r"C:\\Nikhar\Work\Quantum\Work"):
    print(file_name)
    #read all file in directory
    load_pdf = open(r"C:\\Nikhar\\Work\\Quantum\\Work\\"+file_name, "rb")
    read_pdf = PyPDF2.PdfFileReader(load_pdf)
    page_count = read_pdf.getNumPages()
    first_page = read_pdf.getPage(0)
    page_content = first_page.extractText()
    page_content = page_content.replace('\n', '')
    print(page_content)
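A small variation on the same idea, assuming PyPDF2 as in the question: build the path with os.path.join and skip anything that is not a PDF (the folder is the one from the question):

import os
import PyPDF2

folder = r"C:\Nikhar\Work\Quantum\Work"  # folder from the question
for file_name in os.listdir(folder):
    if not file_name.lower().endswith(".pdf"):
        continue  # ignore non-PDF files
    with open(os.path.join(folder, file_name), "rb") as load_pdf:
        read_pdf = PyPDF2.PdfFileReader(load_pdf)
        first_page = read_pdf.getPage(0)
        print(file_name, first_page.extractText().replace("\n", ""))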

Download a PDF from a URL, edit it and render it in Django

I need to download a PDF from Azure Storage, edit the file (extract a specific page) and render it from a Django view. I have this:
import urllib.request
import re
from PyPDF2 import PdfFileReader, PdfFileWriter
import io


def download_desprendible_user(request):
    if request.method == 'POST':
        url = 'https://example.blob.core.windows.net/mypdf.pdf'  # The url where I take the pdf file
        file = urllib.request.urlopen(url)
        id = 'foo'  # Key word I want to find within the pdf to extract that page
        reader = PdfFileReader(io.BytesIO(file.read()))
        for i in range(0, reader.getNumPages()):
            content = ""
            content += reader.getPage(i).extractText() + "\n"
            ResSearch = re.search(id, content)
            if ResSearch is not None:
                page = i
                break
        pdfWriter = PdfFileWriter()
        pdfWriter.addPage(reader.getPage(page))
        with io.BytesIO() as out:
            pdfWriter.write(out)
But I can't manage to render the PDF from the Django view. I don't want to use open because I had issues in production doing that.
EDIT1:
This did work for me but NOT in production:
import urllib.request
import re
from PyPDF2 import PdfFileReader, PdfFileWriter
import io


def download_desprendible_user(request):
    if request.method == 'POST':
        url = 'https://example.blob.core.windows.net/mypdf.pdf'  # The url where I take the pdf file
        file = urllib.request.urlopen(url)
        id = 'foo'  # Key word I want to find within the pdf to extract that page
        reader = PdfFileReader(io.BytesIO(file.read()))
        for i in range(0, reader.getNumPages()):
            content = ""
            content += reader.getPage(i).extractText() + "\n"
            ResSearch = re.search(id, content)
            if ResSearch is not None:
                page = i
                break
        pdfWriter = PdfFileWriter()
        pdfWriter.addPage(reader.getPage(page))
        with open('media/test.pdf', 'wb') as f:
            pdfWriter.write(f)
            f.close()
        return FileResponse(open('media/test.pdf', 'rb'), content_type='application/pdf')
EDIT2:
This works, but I had to change the /media path to another one; not sure if it is the best solution yet:
import urllib.request
import re
from PyPDF2 import PdfFileReader, PdfFileWriter
import io


def download_desprendible_user(request):
    if request.method == 'POST':
        url = 'https://example.blob.core.windows.net/mypdf.pdf'  # The url where I take the pdf file
        file = urllib.request.urlopen(url)
        id = 'foo'  # Key word I want to find within the pdf to extract that page
        reader = PdfFileReader(io.BytesIO(file.read()))
        for i in range(0, reader.getNumPages()):
            content = ""
            content += reader.getPage(i).extractText() + "\n"
            ResSearch = re.search(id, content)
            if ResSearch is not None:
                page = i
                break
        pdfWriter = PdfFileWriter()
        pdfWriter.addPage(reader.getPage(page))
        with open('test/test.pdf', 'wb') as f:
            pdfWriter.write(f)
            f.close()
        return FileResponse(open('test/test.pdf', 'rb'), content_type='application/pdf')
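A rough sketch of how the extracted page could be served straight from memory instead of a file on disk, assuming Django's FileResponse and the same PyPDF2 objects used above (the URL and keyword are placeholders taken from the question):

import io
import re
import urllib.request

from django.http import FileResponse
from PyPDF2 import PdfFileReader, PdfFileWriter


def download_desprendible_user(request):
    if request.method == 'POST':
        url = 'https://example.blob.core.windows.net/mypdf.pdf'  # placeholder from the question
        keyword = 'foo'  # placeholder from the question
        reader = PdfFileReader(io.BytesIO(urllib.request.urlopen(url).read()))

        # find the first page that contains the keyword
        page = 0
        for i in range(reader.getNumPages()):
            if re.search(keyword, reader.getPage(i).extractText()):
                page = i
                break

        pdfWriter = PdfFileWriter()
        pdfWriter.addPage(reader.getPage(page))

        # write the single-page PDF to an in-memory buffer and serve it directly;
        # note the buffer is not closed here, FileResponse streams from it
        buffer = io.BytesIO()
        pdfWriter.write(buffer)
        buffer.seek(0)
        return FileResponse(buffer, content_type='application/pdf', filename='page.pdf')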

Check if a row exists in a CSV file in Python

I'm new to Python. I'm scraping a website and storing the results in a CSV file, but I can't verify whether an entry already exists in the CSV file. I imagine the correct thing would be to loop through the lines to check if the title already exists. Does anyone have any idea how to solve this? See the code below:
from pathlib import Path
import time
import csv
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd

DRIVER_PATH = str(Path('geckodriver').resolve())


def write_csv(ads):
    filename = 'results.csv'
    with open(filename, 'a+') as f:
        fields = ['title', 'url']
        writer = csv.DictWriter(f, fieldnames=fields)
        existing_lines = csv.reader(f)
        for ad in ads:
            if ad['title'] not in existing_lines:
                print(existing_lines)
                writer.writerow(ad)
                print('success')
            else:
                print('fail')


def get_html(url):
    browser = webdriver.Firefox(executable_path=DRIVER_PATH)
    browser.get(url)
    return browser.page_source


def scrapde_data(card):
    try:
        h2 = card.h2
    except:
        title = ''
        url = ''
    else:
        title = h2.text.strip()
        try:
            url = card.find('a').get('href')
        except:
            url = ''
    data = {'title': title, 'url': url}
    return data


def main():
    while True:
        url = '#'
        html = get_html(url)
        soup = BeautifulSoup(html, 'lxml')
        cards = soup.find_all('div', {"class": "produto--comprar"})
        ads_data = []
        for card in cards:
            data = scrapde_data(card)
            ads_data.append(data)
        write_csv(ads_data)
        time.sleep(5)


if __name__ == '__main__':
    main()
Help me please? :(
Try the following code:
def write_csv(ads):
    filename = 'results.csv'
    with open(filename, 'a+', newline='') as f:
        fields = ['title', 'url']
        writer = csv.DictWriter(f, fieldnames=fields)
        # moving file pointer to the start of the file
        f.seek(0)
        existing_lines = csv.reader(f)
        # finding no of lines in the file
        count = 0
        for line in existing_lines:
            count += 1
            break
        # if file is not empty
        if count > 0:
            for ad in ads:
                flag = 0
                # moving file pointer to the start of the file
                f.seek(0)
                # checking if ad['title'] is present in the first column of csv file
                for line in existing_lines:
                    if ad['title'] in line[0]:
                        flag = 1
                # if ad['title'] is not found
                if flag == 0:
                    writer.writerow(ad)
        # if file is empty write the dictionary contents into the csv
        else:
            for ad in ads:
                writer.writerow(ad)
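An alternative sketch of the same idea that reads the existing titles into a set first and then appends only new rows; the filename and field names are from the question, everything else is illustrative:

import csv
import os


def write_csv(ads, filename='results.csv'):
    fields = ['title', 'url']

    # collect the titles already present in the file (first column)
    existing_titles = set()
    if os.path.exists(filename):
        with open(filename, newline='') as f:
            for row in csv.reader(f):
                if row:
                    existing_titles.add(row[0])

    # append only the ads whose title has not been seen yet
    with open(filename, 'a', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fields)
        for ad in ads:
            if ad['title'] not in existing_titles:
                writer.writerow(ad)
                existing_titles.add(ad['title'])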

How to read from PDF and save as CSV, using Python?

There is this URL https://www.jpx.co.jp/english/listing/stocks/new/index.html#3422
I wrote (copied & pasted from the internet!) the following code to save all the PDFs that are inside the table to a folder:
from PyPDF2 import PdfFileReader
import requests
from bs4 import BeautifulSoup
import io
import urllib.request as req
import urllib
import os
import time
from urllib.parse import urljoin

url = 'https://www.jpx.co.jp/english/listing/stocks/new/index.html'
headers = {'User-Agent': 'Mozilla/5.0'}
res = req.urlopen(url)
soup = BeautifulSoup(res, "html.parser")
result = soup.select("a[href]")
link_list = []
for link in result:
    href = link.get("href")
    link_list.append(href)
pdf_list = [temp for temp in link_list if temp.endswith('pdf')]
print(pdf_list)

abs_pdf_list = []
for relative in pdf_list:
    temp_url = urljoin(url, relative)
    abs_pdf_list.append(temp_url)

filename_list = []
for target in abs_pdf_list:
    temp_list = target.split("/")
    filename_list.append(temp_list[len(temp_list)-1])

newpath = r'/Users/myfolder/python/IPO'
if not os.path.exists(newpath):
    os.makedirs(newpath)

target_dir = "/Users/myfolder/python/IPO/"
savepath_list = []
for filename in filename_list:
    savepath_list.append(os.path.join(target_dir, filename))
savepath_list

for (pdflink, savepath) in zip(abs_pdf_list, savepath_list):
    print(pdflink)
    urllib.request.urlretrieve(pdflink, savepath)
    time.sleep(2)

import pdfplumber
import re


def download_file(url):
    local_filename = url.split('/')[-1]
    with requests.get(url) as r:
        with open(local_filename, 'wb') as f:
            f.write(r.content)
    return local_filename


ap_url = abs_pdf_list[0]
ap = download_file(ap_url)

with pdfplumber.open(ap) as pdf:
    page1 = pdf.pages[0]
    page2 = pdf.pages[1]
    text = page1.extract_text()
print(text)
Now I need to read those PDFs and extract the lines below.
From page 1:
the line which starts with "Information & Communication"
From page 2:
the lines which start with
"Book-building Period"
"Offering Price"
and save them in one Excel or CSV file.
Sadly, I have reached the limit of my coding skills and can't move any further. I converted the PDF to text, but …
Please advise me how to do this.
I would recommend installing our new package, pdftextract, which preserves the PDF layout as well as possible when extracting text, and then using some regex to extract the keywords.
Here's a working code snippet tested on 2 PDF files from your link:
import re
import csv
from pdftextract import XPdf

pdf_files = ['a.pdf', "b.pdf"]
keywords = ["Information & Communication", "Book-building Period", "Offering Price"]


def extract_infos(file: str, keywords: list):
    """extract the text from the pdf file then get the wanted keywords information"""
    # extracting the text from pdf while keeping the original layout
    pdf = XPdf(file)
    txt = pdf.to_text(keep_layout=True)
    row = []
    # getting the keywords information
    for keyword in keywords:
        # search for the keyword
        pattern = "{} (.+)\r".format(keyword)  # extracting the wanted info
        regex = re.compile(pattern, flags=re.I | re.M)
        m = regex.search(txt)
        if m is not None:
            m = m.groups()[0].strip(' /\r')  # strip unwanted space and characters
            row.append(m)
    return row


def main(files: list, fname: str, headers: list):
    """extract the wanted info from a bunch of pdf files and save them as csv file"""
    with open(fname, "w") as wf:
        writer = csv.writer(wf)
        writer.writerow(headers)
        for i, file in enumerate(files, start=1):
            row = extract_infos(file, headers)
            writer.writerow(row)
        print("[DONE]", "writed {} rows to {}.".format(i, fname))


main(pdf_files, "stocks.csv", keywords)
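If you would rather stick with pdfplumber, which the question already imports, a hedged sketch of the same approach; the keywords come from the question, while the regex and the file names are assumptions:

import csv
import re

import pdfplumber

keywords = ["Information & Communication", "Book-building Period", "Offering Price"]


def extract_row(path, keywords):
    """Pull the value following each keyword from the first two pages of a PDF."""
    with pdfplumber.open(path) as pdf:
        text = "\n".join((page.extract_text() or "") for page in pdf.pages[:2])
    row = []
    for keyword in keywords:
        m = re.search(re.escape(keyword) + r"\s+(.+)", text)
        row.append(m.group(1).strip() if m else "")
    return row


with open("stocks.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(keywords)
    for path in ["a.pdf", "b.pdf"]:  # PDFs downloaded in the earlier step
        writer.writerow(extract_row(path, keywords))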

Downloading an image using Python Mechanize

I'm trying to write a Python script to download an image and set it as my wallpaper. Unfortunately, the Mechanize documentation is quite poor. My script follows the link correctly, but I'm having a hard time actually saving the image to my computer. From what I researched, the .retrieve() method should do the job, but how do I specify the path the file should be downloaded to? Here is what I have...
def followLink(browser, fixedLink):
    browser.open(fixedLink)
    if browser.find_link(url_regex=r'1600x1200'):
        browser.follow_link(url_regex=r'1600x1200')
    elif browser.find_link(url_regex=r'1400x1050'):
        browser.follow_link(url_regex=r'1400x1050')
    elif browser.find_link(url_regex=r'1280x960'):
        browser.follow_link(url_regex=r'1280x960')
    return
import mechanize, os
from BeautifulSoup import BeautifulSoup

browser = mechanize.Browser()
html = browser.open(url)
soup = BeautifulSoup(html)
image_tags = soup.findAll('img')
for image in image_tags:
    filename = image['src'].lstrip('http://')
    filename = os.path.join(dir, filename.replace('/', '_'))
    data = browser.open(image['src']).read()
    browser.back()
    save = open(filename, 'wb')
    save.write(data)
    save.close()
This can help you download all the images from a web page. As for parsing the HTML, you'd be better off using BeautifulSoup or lxml. Downloading is just reading the data and then writing it to a local file. You should assign your own value to dir; it is the directory where your images will be stored.
Not sure why this solution hasn't come up, but you can use the mechanize.Browser.retrieve function as well. Perhaps this only works in newer versions of mechanize and has thus not been mentioned?
Anyway, if you wanted to shorten the answer by zhangyangyu, you could do this:
import mechanize, os
from BeautifulSoup import BeautifulSoup

browser = mechanize.Browser()
html = browser.open(url)
soup = BeautifulSoup(html)
image_tags = soup.findAll('img')
for image in image_tags:
    filename = image['src'].lstrip('http://')
    filename = os.path.join(dir, filename.replace('/', '_'))
    browser.retrieve(image['src'], filename)
    browser.back()
Also keep in mind that you'll likely want to put all of this into a try except block like this one:
import mechanize, os
from BeautifulSoup import BeautifulSoup

browser = mechanize.Browser()
html = browser.open(url)
soup = BeautifulSoup(html)
image_tags = soup.findAll('img')
for image in image_tags:
    filename = image['src'].lstrip('http://')
    filename = os.path.join(dir, filename.replace('/', '_'))
    try:
        browser.retrieve(image['src'], filename)
        browser.back()
    except (mechanize.HTTPError, mechanize.URLError) as e:
        pass
        # Use e.code and e.read() with HTTPError
        # Use e.reason.args with URLError
Of course you'll want to adjust this to your needs. Perhaps you want it to bomb out if it encounters an issue. It totally depends on what you want to achieve.
You can get/download the image by opening the url of the img src.
image_response = browser.open_novisit(img['src'])
To save the file, just use open:
with open('image_out.png', 'wb') as f:
    f.write(image_response.read())
It's really crappy, but it "works" for me, building on 0xc0000022l's answer:
import mechanize, os
from BeautifulSoup import BeautifulSoup
import urllib2


def DownloadIMGs(url):  # IMPORTANT: URL WITH HTTP OR HTTPS
    print "From", url
    dir = 'F:\Downloadss'  # Dir for Downloads
    basicImgFileTypes = ['png', 'bmp', 'cur', 'ico', 'gif', 'jpg', 'jpeg', 'psd', 'raw', 'tif']
    browser = mechanize.Browser()
    html = browser.open(url)
    soup = BeautifulSoup(html)
    image_tags = soup.findAll('img')
    print "N Images:", len(image_tags)
    print
    #---------SAVE PATH
    #check if available
    if not os.path.exists(dir):
        os.makedirs(dir)
    #---------SAVE PATH
    for image in image_tags:
        #---------SAVE PATH + FILENAME (Where It is downloading)
        filename = image['src']
        fileExt = filename.split('.')[-1]
        fileExt = fileExt[0:3]
        if (fileExt in basicImgFileTypes):
            print 'File Extension:', fileExt
            filename = filename.replace('?', '_')
            filename = os.path.join(dir, filename.split('/')[-1])
            num = filename.find(fileExt) + len(fileExt)
            filename = filename[:num]
        else:
            filename = filename.replace('?', '_')
            filename = os.path.join(dir, filename.split('/')[-1]) + '.' + basicImgFileTypes[0]
        print 'File Saving:', filename
        #---------SAVE PATH + FILENAME (Where It is downloading)
        #--------- FULL URL PATH OF THE IMG
        imageUrl = image['src']
        print 'IMAGE SRC:', imageUrl
        if (imageUrl.find('http://') > -1 or imageUrl.find('https://') > -1):
            pass
        else:
            if (url.find('http://') > -1):
                imageUrl = url[:len('http://')]
                imageUrl = 'http://' + imageUrl.split('/')[0] + image['src']
            elif (url.find('https://') > -1):
                imageUrl = url[:len('https://')]
                imageUrl = 'https://' + imageUrl.split('/')[0] + image['src']
            else:
                imageUrl = image['src']
        print 'IMAGE URL:', imageUrl
        #--------- FULL URL PATH OF THE IMG
        #--------- TRY DOWNLOAD
        try:
            browser.retrieve(imageUrl, filename)
            print "Downloaded:", image['src'].split('/')[-1]
            print
        except (mechanize.HTTPError, mechanize.URLError) as e:
            print "Can't Download:", image['src'].split('/')[-1]
            print
            pass
        #--------- TRY DOWNLOAD
    browser.close()


DownloadIMGs('https://stackoverflow.com/questions/15593925/downloading-a-image-using-python-mechanize')
