I am opening an Excel spreadsheet, reading the URLs in column B, fetching each page, running regexes over it, and saving the matches into columns 3 and 4 of the same spreadsheet. Everything runs without errors, but only the last row gets written. I think it is because of how my for loops are structured; I have searched numerous topics on here but haven't been able to work it out yet. (A corrected sketch is included after the example URLs below.)
import os
from openpyxl import load_workbook
import urllib
import re
from urllib2 import urlopen

directoryPath = r'C:\Python27'  # The main folder
os.chdir(directoryPath)
folder_list = os.listdir(directoryPath)
for folders, sub_folders, file in os.walk(directoryPath):  # Traversing the sub folders
    for name in file:
        if name.endswith(".xlsx"):
            filename = os.path.join(folders, name)
            wb = load_workbook(filename, data_only=True)
            ws = wb.active
            cell_range = ws['B2':'B4']
            f = urllib.urlopen(cell.value)
            s = f.read()
            for row in cell_range:  # This is iterating through rows 1-7
                for cell in row:  # This iterates through the columns(cells) in that row
                    ws.cell(row=ws._current_row, column=3).value = str(re.findall(r"[A-Za-z0-9._%+-]+#[A-Za-z0-9.-]+\.[A-Za-z]{2,4}", s))
                    ws.cell(row=ws._current_row, column=4).value = str(re.findall(r"\(?(?:(?:0(?:0|11)\)?[\s-]?\(?|\+)44\)?[\s-]?\(?(?:0\)?[\s-]?\(?)?|0)(?:\d{2}\)?[\s-]?\d{4}[\s-]?\d{4}|\d{3}\)?[\s-]?\d{3}[\s-]?\d{3,4}|\d{4}\)?[\s-]?(?:\d{5}|\d{3}[\s-]?\d{3})|\d{5}\)?[\s-]?\d{4,5}|8(?:00[\s-]?11[\s-]?11|45[\s-]?46[\s-]?4\d))(?:(?:[\s-]?(?:x|ext\.?\s?|\#)\d+)?)", s))
            wb.save('scraper.xlsx')
Example URLs to test with:
http://www.knaptoninsurance.co.uk/contact-us
http://www.sterlingsafetywear.co.uk/contact
http://www.ilu.org.uk/contact.html
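A minimal sketch of one way to restructure the loop so each row's URL is fetched and written individually (this is an assumed fix, not the original code: it presumes Python 2, per the urllib2 import, works on a single workbook for brevity, and uses @ in the email pattern where the question has #):

import re
import urllib2
from openpyxl import load_workbook

EMAIL_RE = r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,4}"

wb = load_workbook('scraper.xlsx', data_only=True)
ws = wb.active

for row in ws['B2':'B4']:
    for cell in row:
        if not cell.value:
            continue
        s = urllib2.urlopen(cell.value).read()  # fetch this row's URL inside the loop
        # write the matches to this cell's own row instead of ws._current_row
        ws.cell(row=cell.row, column=3).value = str(re.findall(EMAIL_RE, s))
        # the phone-number regex from the question would go into column 4 the same way

wb.save('scraper.xlsx')

The key change is that the fetch and both writes happen inside the inner loop and are keyed on cell.row, so every row gets its own result instead of a single row being overwritten by the last fetch.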
I am trying to build a function that iterates over the names in a CSV I pass in, reads the last serial number written to a JSON file, adds one for each name, and puts the serial number beside every name in the CSV. What actually happens is that the function generates the first serial number successfully and saves it in the JSON file, but it fails to add it to the CSV via pandas and fails to keep updating the number in the JSON file.
This is the code of the function:
from docx import Document
import pandas as pd
from datetime import datetime
import time
import os
from docx2pdf import convert
import json

date = datetime.date(datetime.now())
strdate = date.strftime("%d-%m-%Y")
year = date.strftime("%Y")
month = date.strftime("%m")

def genrateserial(a):
    jsonFile1 = open("data_file.json", "r")
    lastserial = jsonFile1.read()
    jsonFile1.close()
    for d in range(len(lastserial)):
        if lastserial[d] == "\"":
            lastserial[d].replace("\"", "")
    jsonFile1.close()
    if strdate == "01" or (month[1] != lastserial[8]):
        num = 1
        last = f"JO/{year}{month}{num}"
        data = f"{last}"
        jsonstring = json.dumps(data)
        jsonfile2 = open("data_file.json", "w")
        jsonfile2.write(jsonstring)
        jsonfile2.close()
    database = pd.read_csv(a)
    df = pd.DataFrame(database)
    df = df.dropna(axis=0)
    for z in range(len(df.Name)):
        newentry = f"JO/{year}{month}{num+1}"
        jsonstring1 = json.dumps(newentry)
        jsonfile3 = open("data_file.json", "w")
        jsonfile3.write(jsonstring1)
        jsonfile3.close()
        df.iloc[[z], 3] = newentry
genrateserial('database.csv')
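A hedged sketch of one way to make this behave as described (not the original code; it assumes the JSON file holds a single serial string such as "JO/2022051"). The main issues it works around: Python strings are immutable, so lastserial[d].replace(...) discards its result; num is only defined inside the if branch; the counter never increases past num+1 inside the loop; and the DataFrame is never written back with df.to_csv:

import json
import pandas as pd
from datetime import datetime

def generate_serial(csv_path, json_path="data_file.json"):
    # hypothetical rewrite of genrateserial; json.load returns the plain string,
    # so no manual quote-stripping is needed
    with open(json_path) as fh:
        last_serial = json.load(fh)          # e.g. "JO/2022051"

    prefix = "JO/" + datetime.now().strftime("%Y%m")
    # restart numbering when the month changes, otherwise continue counting
    num = int(last_serial[len(prefix):]) if last_serial.startswith(prefix) else 0

    df = pd.read_csv(csv_path).dropna(axis=0)
    for z in range(len(df)):
        num += 1                             # one new serial per name
        df.iloc[z, 3] = f"{prefix}{num}"

    # persist both the updated CSV and the last serial issued
    df.to_csv(csv_path, index=False)
    with open(json_path, "w") as fh:
        json.dump(f"{prefix}{num}", fh)

generate_serial("database.csv")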
I need to read XML files and fetch the data into a dataframe. I have developed this to extract the data from one XML file.
import pandas as pd
import numpy as np
import xml.etree.cElementTree as et
import datetime

tree = et.parse('/data/dump_xml/1013.xml')
root = tree.getroot()

NAME = []
for name in root.iter('name'):
    NAME.append(name.text)
print(NAME[0])
print(NAME[1])

UPDATE = []
for update in root.iter('lastupdate'):
    UPDATE.append(update.text)
updated = datetime.datetime.fromtimestamp(int(UPDATE[0]))
lastupdate = updated.strftime('%Y-%m-%d %H:%M:%S')

ParaValue = []
for parameterevalue in root.iter('value'):
    ParaValue.append(parameterevalue.text)
print(ParaValue[0])
print(ParaValue[1])

print(lastupdate, NAME[0], ParaValue[0])
print(lastupdate, NAME[1], ParaValue[1])
For each file I need to get the two results below as output:
2022-05-23 11:25:01 in 1.5012356187e+05
2022-05-23 11:25:01 out 1.7723777592e+05
Now I need to do this for all the XML files in /data/dump_xml/ and build a single df with all the data in one run. Can someone help me improve my code?
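A minimal sketch of one way to do that (an assumption, not tested against the real dumps: it presumes every file under /data/dump_xml/ has the same name/lastupdate/value layout as 1013.xml): parse each file, collect one row per name/value pair, and build a single DataFrame at the end.

import glob
import datetime
import pandas as pd
import xml.etree.cElementTree as et

rows = []
for path in glob.glob('/data/dump_xml/*.xml'):
    root = et.parse(path).getroot()
    updated = datetime.datetime.fromtimestamp(int(root.find('.//lastupdate').text))
    lastupdate = updated.strftime('%Y-%m-%d %H:%M:%S')
    names = [n.text for n in root.iter('name')]
    values = [v.text for v in root.iter('value')]
    # one row per name/value pair, keeping track of the source file
    for name, value in zip(names, values):
        rows.append({'file': path, 'lastupdate': lastupdate, 'name': name, 'value': value})

df = pd.DataFrame(rows)
print(df)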
I am writing a script that reads a folder of .pdfs and extracts their fillable fields into a pandas df. I had success extracting one .pdf with the following code:
import numpy as np
import pandas as pd
import PyPDF2
import glob, os
pwd = os.getcwd()
pdfFileObj = open('pdf_filename', 'rb')
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
fields_dict = pdfReader.getFormTextFields()
series = pd.Series(fields_dict).to_frame()
df = pd.DataFrame(pd.Series(fields_dict)).T
I want to build a function that runs this script for all pdfs in the directory. My first idea was to use glob to collect all the pdfs. Here is what I have so far:
import numpy as np
import pandas as pd
import PyPDF2
import glob, os

pwd = os.getcwd()

def readfiles():
    os.chdir(pwd)
    pdfs = []
    for file in glob.glob("*.pdf"):
        print(file)
        pdfs.append(file)

pdfFileObj = open(readfiles, 'rb')
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
fields_dict = pdfReader.getFormTextFields()
series = pd.Series(fields_dict).to_frame()
df = pd.DataFrame(pd.Series(fields_dict)).T
Unfortunately, this doesn't work because I cannot pass a function to PdfFileReader. Does anyone have suggestions on a better way to do this? Thanks!
I can't comment (new account), but you could try making your readfiles function return the list pdfs.
Then in code execution below just:
listofPDF = readfiles()
arrayofDF = list()
for file in listofPDF:
    pdfFileObj = open(file, 'rb')
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
    # execute your code to obtain a single dataframe from a pdf here
    fields_dict = pdfReader.getFormTextFields()
    series = pd.Series(fields_dict).to_frame()
    df = pd.DataFrame(pd.Series(fields_dict)).T
    arrayofDF.append(df)
You would end up with a list of dataframes, each one corresponding to one of the pdf files, provided the first part of the code (in which you get the dataframe from a single pdf file) works.
Additionally, you could make a dictionary like {filename: file, dataframe: df} and then append that to your list, so you can later recover each dataframe based on the name of its file. It all depends on what you plan to do with the dataframes later.
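For completeness, a hedged sketch of the whole flow (it assumes readfiles is changed to end with return pdfs, and stacks everything into one DataFrame with a filename column so each row can be traced back to its pdf; adjust to taste):

import glob
import pandas as pd
import PyPDF2

def readfiles():
    # collect every pdf in the current directory and return the list
    return glob.glob("*.pdf")

frames = []
for filename in readfiles():
    with open(filename, 'rb') as pdfFileObj:
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
        fields_dict = pdfReader.getFormTextFields()
    df = pd.DataFrame(pd.Series(fields_dict)).T
    df['filename'] = filename                # keep track of the source pdf
    frames.append(df)

combined = pd.concat(frames, ignore_index=True)  # one row per pdf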
import pandas as pd
import requests
from bs4 import BeautifulSoup

df = pd.read_csv('env_sequences.csv')
Namedf = df['Name']
Uniprotdf = df['ID']

for row in Uniprotdf:
    theurl = 'https://www.uniprot.org/uniprot/' + row + '.fasta'
    page = requests.get(theurl).content

for row in Namedf:
    fasta = open(row + '.txt', 'w')
    fasta.write(page)
    fasta.close()

# Sample website: https://www.uniprot.org/uniprot/P04578.fasta
I have a .csv file. I am using its 'ID' column to generate links to pages whose content I want to download, saving each one under the corresponding name from the 'Name' column of the same .csv.
The code stops working in the second for loop, where I get a TypeError for passing the page variable to fasta.write(). Yet if I print(page) I can see exactly the text I want in each file. Is this a case of having to convert the HTML into a string? I am unsure how to proceed from here.
For the given URL, if you print the content of the page, you'll notice it starts with a b'' prefix, which indicates it is bytes (binary) rather than str.
print (page)
b'>sp|P04578|ENV_HV1H2 Envelope glycoprotein gp160 OS=Human immunodeficiency virus type 1 group M subtype B (isolate HXB2) OX=11706 GN=env PE=1 SV=2\nMRVKEKYQHLWRWGWRWGTMLLGMLMICSATEKLWVTVYYGVPVWKEATTTLFCASDAKA\nYDTEVHNVWATHACVPTDPNPQEVVLVNVTENFNMWKNDMVEQMHEDIISLWDQSLKPCV\nKLTPLCVSLKCTDLKNDTNTNSSSGRMIMEKGEIKNCSFNISTSIRGKVQKEYAFFYKLD\nIIPIDNDTTSYKLTSCNTSVITQACPKVSFEPIPIHYCAPAGFAILKCNNKTFNGTGPCT\nNVSTVQCTHGIRPVVSTQLLLNGSLAEEEVVIRSVNFTDNAKTIIVQLNTSVEINCTRPN\nNNTRKRIRIQRGPGRAFVTIGKIGNMRQAHCNISRAKWNNTLKQIASKLREQFGNNKTII\nFKQSSGGDPEIVTHSFNCGGEFFYCNSTQLFNSTWFNSTWSTEGSNNTEGSDTITLPCRI\nKQIINMWQKVGKAMYAPPISGQIRCSSNITGLLLTRDGGNSNNESEIFRPGGGDMRDNWR\nSELYKYKVVKIEPLGVAPTKAKRRVVQREKRAVGIGALFLGFLGAAGSTMGAASMTLTVQ\nARQLLSGIVQQQNNLLRAIEAQQHLLQLTVWGIKQLQARILAVERYLKDQQLLGIWGCSG\nKLICTTAVPWNASWSNKSLEQIWNHTTWMEWDREINNYTSLIHSLIEESQNQQEKNEQEL\nLELDKWASLWNWFNITNWLWYIKLFIMIVGGLVGLRIVFAVLSIVNRVRQGYSPLSFQTH\nLPTPRGPDRPEGIEEEGGERDRDRSIRLVNGSLALIWDDLRSLCLFSYHRLRDLLLIVTR\nIVELLGRRGWEALKYWWNLLQYWSQELKNSAVSLLNATAIAVAEGTDRVIEVVQGACRAI\nRHIPRRIRQGLERILL\n'
Changing the 'w' to 'wb' when opening the file should fix it. Also, using with open() is the more Pythonic way of handling files:
for row in Namedf:
    with open(row + '.txt', 'wb') as fasta:
        fasta.write(page)
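Note that the two separate loops also mean page only ever holds the last response, so every file would receive the same content. A hedged sketch that pairs each ID with its name via zip and writes each record as it is fetched (same idea, just combined into one loop):

import pandas as pd
import requests

df = pd.read_csv('env_sequences.csv')

for uniprot_id, name in zip(df['ID'], df['Name']):
    url = 'https://www.uniprot.org/uniprot/' + uniprot_id + '.fasta'
    page = requests.get(url).content          # bytes, hence the 'wb' mode below
    with open(name + '.txt', 'wb') as fasta:
        fasta.write(page)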
I have Python code that takes multiple text files as input and generates a separate CSV file for each one, so if my text files are ABC.txt and XYX.txt, my code produces two CSV files, ABC.csv and XYX.csv. My ultimate goal is to get one single CSV file with all the output. Since I am more comfortable with SQL, I was thinking about uploading all the files to a database and combining them there, but I was wondering if I can modify my Python code below to generate one single CSV file containing all the output. Here is my code:
import json
from watson_developer_cloud import ToneAnalyzerV3Beta
import urllib.request
import codecs
import csv
import os
import re
import sys
import collections
import glob
import xlwt
from bs4 import BeautifulSoup

ipath = 'C:/TEMP/'  # input folder
opath = 'C:/TEMP/'  # output folder
reader = codecs.getreader("utf-8")

tone_analyzer = ToneAnalyzerV3Beta(
    url='https://gateway.watsonplatform.net/tone-analyzer/api',
    username='1f2fd51b-d0fb-45d8-aba2-08e22777b77d',
    password='DykYfXjV4UXP',
    version='2016-02-11')

path = 'C:/TEMP/*.html'
file = glob.glob(path)
# iterate over the list getting each file
writer = csv.writer(open('C:/TEMP/test', mode='w'))
# now enter our input loop
for fle in file:
    # open the file and then call .read() to get the text
    with open(fle) as f:
        ...
    # output tone name and score to file
    for i in tonename:
        writer.writerows((tone['tone_name'], tone['score']) for tone in cat['tones'])
Modifying your existing code as little as possible ... you simply need to open the csv file before entering your loop that reads the text files:
...
path = 'C:/TEMP/*.html'
file = glob.glob(path)
# !! open our output csv
writer = csv.writer(open('our-merged-data', mode='w'))
# iterate over the list getting each file
for fle in file:
    # open the file and then call .read() to get the text
    with open(fle) as f:
        ...
    # output tone name and score to file
    for i in tonename:
        writer.writerows((tone['tone_name'], tone['score'], Date, Title) for tone in cat['tones'])
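A small, hedged variation on the same idea, shown as a self-contained skeleton (analyze_tones and the merged_output.csv name are placeholders for the elided Tone Analyzer call and whatever output path you prefer): opening the output once with with open(..., newline='') keeps the writer alive for the whole run, gives the merged file a .csv extension, and avoids the blank rows csv.writer can otherwise produce on Windows.

import csv
import glob

def analyze_tones(text):
    # placeholder for the elided Tone Analyzer call in the question;
    # it should return a list of {'tone_name': ..., 'score': ...} dicts
    return []

# open the merged output once, before the per-file loop
with open('C:/TEMP/merged_output.csv', mode='w', newline='') as out:
    writer = csv.writer(out)
    writer.writerow(['tone_name', 'score', 'file'])      # header row
    for fle in glob.glob('C:/TEMP/*.html'):
        with open(fle) as f:
            tones = analyze_tones(f.read())
        # one row per tone, tagged with the source file it came from
        writer.writerows((t['tone_name'], t['score'], fle) for t in tones)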