import sys
import os
import urllib
from xml.etree.ElementTree import ElementTree
from xml.etree.ElementTree import tostring
import flickrapi
# Script: request the metadata of a single Flickr photo through flickrapi and
# print the farm, id and server attributes of the children of the response root.
# NOTE(review): the indentation of this snippet appears to have been lost; the
# assignments after the `for` line are presumably its loop body -- confirm.
# The API key and secret are placeholders (a single space each).
api_key = ' '
api_password = ' '
# Id of the one photo that is looked up below.
photo_id='2124494179'
flickr= flickrapi.FlickrAPI(api_key, api_password)
#photos= flickr.photos_getinfo(photo_id='15295705890')
#tree=ElementTree(flickr.photos_getinfo(photo_id))
#image_id=open('photoIds.txt','r')
#Image_data=open('imageinformation','w')
#e=image_id.readlines(10)
#f= [s.replace('\r\n', '') for s in e]
#num_of_lines=len(f)
#image_id.close()
#i=0
#while i<269846:
# term=f[i]
#try:
# Wrap the value returned by photos_getinfo in an ElementTree so that
# getroot() can be called on it.
photoinfo=flickr.photos_getinfo(photo_id=photo_id)
photo_tree=ElementTree(photoinfo)
#photo_tree.write('photo_tree')
#i+=1
#photo=photo_tree.getroot()
#photodata=photo.getiterator()
#for elem in owner.getiterator():
#for elem in photo.getiterator():
# Iterating an element yields its direct children only, so nested elements
# are not visited by this loop.
for elem in photo_tree.getroot():
farm=elem.attrib['farm']
# NOTE(review): `id` shadows the builtin of the same name.
id=elem.attrib['id']
server=elem.attrib['server']
#title=photo_tree.find('title').txt
#for child in elem.findall():
# username=child.attrib['username']
# location=child.attrib['location']
# user=elem.attrib['username']
print (farm)
print(id)
print(server)
#owner=photo_tree.findall('owner')
# print(username)
#filename="%s.txt"%(farm)
#f=open(filename,'w')
#f.write("%s"%farm)
#for elem in photo_tree.getiterator():
#for child in photo_tree.getiterator():
#print (child.attrib)
#owner=child.attrib['username']
I would like to read image IDs from a file, pass each one to a flickrapi method to get that image's information, and save the result to a text file in the form: image id=..., user name=..., location=..., tags=..., and so on. I can get the attributes of the first element by using .getroot(), but when I try to get the attributes of other elements it returns an error. I want to save the attributes to a txt file and read the image IDs from a file so that I can use this data in the algorithm I'm working on.
I figured out a way to solve the problem (I'm a beginner and know almost nothing about Python). Since the response is an in-memory object rather than a saved XML file, we need to iterate over it using the tag names, as follows:
# Extract the upload and taken dates from a photos_getinfo response and append
# them, tab-separated, to a per-photo text file.
# Relies on names defined elsewhere in the script: photoinfo, head, id, and the
# datetime module.
photo_tree = ElementTree(photoinfo)

# 'dateuploaded' is an attribute of the root's child element(s) and holds a
# Unix timestamp, which is converted to a readable local date-time string.
for elem in photo_tree.getroot():
    uploaded = elem.attrib['dateuploaded']
    uploaded = datetime.datetime.fromtimestamp(float(uploaded)).strftime('%Y-%m-%d %H:%M:%S')

# iter() replaces getiterator(), which no longer exists in Python 3.9+.
for elem in photo_tree.iter(tag='dates'):
    taken_date = elem.attrib['taken']

# '%s' works whether `id` is an int or the string read from the XML attribute
# ('%d' raises TypeError for a string); `with` guarantees the file is closed.
with open(head + 'filename/' + '%s.txt' % id, 'a') as photo_info:
    photo_info.write(str(id) + '\t' + uploaded + '\t' + taken_date + '\t' + '\n')
Maybe it helps someone who is looking for a solution to the same problem. Or maybe there is a more efficient way to solve this issue!
Related
I am trying to copy elements of a doc from one doc file to other. The text part is easy, the images is where it gets tricky.
Attaching an image to explain the structure of the doc: Just some text and 1 image.
from docx import Document
import io
# Append each child of the source document's body to the body of a new,
# empty document, then save the new document.
doc = Document('/Users/neha/Desktop/testing.docx')
new_doc = Document()
for elem in doc.element.body:
    new_doc.element.body.append(elem)
new_doc.save('/Users/neha/Desktop/out.docx')
This gets me the whole structure of the doc in the new_doc but the image is still blank. Image below:
Good thing is I have the blank image in the right place so I thought of getting the byte level data from the previous image and insert it in the new doc. Here is how I extended the above code:
from docx import Document
import io
def _first_blip_rid(document):
    """Return the relationship id embedded in the document's first inline shape."""
    first_shape = document.inline_shapes[0]
    return first_shape._inline.graphic.graphicData.pic.blipFill.blip.embed


doc = Document('/Users/neha/Desktop/testing.docx')
new_doc = Document()
for elem in doc.element.body:
    new_doc.element.body.append(elem)

# Here I get the byte level data for the image
image_blob = doc.part.related_parts[_first_blip_rid(doc)]._blob

# Overwrite the blob of the part that the new document's first inline shape
# points at with the bytes taken from the source document.
new_doc.part.related_parts[_first_blip_rid(new_doc)]._blob = image_blob
new_doc.save('/Users/neha/Desktop/out.docx')
But the image still shows empty in the new_doc. What should I do from here?
I figured out a solution a couple of days back. However the text loses formatting using this way, but the images are correctly placed.
So the idea is, for para in paras for the source doc, if there is text, I write it to dest doc. And if there is an inline image present, I add a unique identifier at that place in the dest doc (refer here to see how these identifiers work, and contexts in docxtpl). These identifiers and docxtpl proved to be particularly useful here. And then using those unique identifiers I create a 'context' (as shown below) which is basically a map mapping the unique identifier to its particular InlineImage, and finally I render this context..
Below is my code (Apologies for the unnecessary indentation, I copied it directly from my text editor, and shift+tab doesn't work here :P)
from docxtpl import DocxTemplate, InlineImage
import Document
import io
import xml.etree.ElementTree as ET
# Rebuild a document paragraph by paragraph: text is copied as plain runs and
# every paragraph containing an inline image gets a "{{ img_N }}" placeholder
# that is later rendered to the corresponding InlineImage.
# Relies on names defined elsewhere: source_path, dest_path, self.img_path.
# NOTE(review): `self` only exists inside a method, so this was presumably
# lifted out of a class -- confirm where img_path should come from.
# NOTE(review): DocxTemplate is constructed without arguments here -- confirm
# against the docxtpl version in use.
dest = DocxTemplate()
source = Document(source_path)
context = {}

# Step 1: dump the bytes of every inline image to its own file on disk and
# remember the file names in document order.
im_addresses = []
for count, im in enumerate(source.inline_shapes):
    rId = im._inline.graphic.graphicData.pic.blipFill.blip.embed
    byte_data = source.part.related_parts[rId]._blob
    image_name = self.img_path+"img_"+"_"+str(count)+".jpeg"
    # The `with` block closes the file; no explicit close() is needed.
    with open(image_name, "wb") as fh:
        fh.write(byte_data)
    im_addresses.append(image_name)

# Step 2: copy paragraph text and drop one placeholder per image paragraph.
# The namespace map is loop-invariant, so it is built once.
namespace = {'wp':"http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing"}
im_idx = 0
for para in source.paragraphs:
    p = dest.add_paragraph()
    r = p.add_run()
    if para.text:
        r.add_text(para.text)
    root = ET.fromstring(para._p.xml)
    inlines = root.findall('.//wp:inline',namespace)
    # Only one placeholder is emitted per paragraph, however many inline
    # images the paragraph contains.
    if inlines:
        uid = "img_"+str(im_idx)
        r.add_text("{{ " + uid + " }}")
        context[uid] = InlineImage(dest,im_addresses[im_idx])
        im_idx += 1

# Step 3: render the placeholders; a rendering failure is reported but the
# document is still saved (best effort, as in the original).
try:
    dest.render(context)
except Exception as e:
    print(e)
dest.save(dest_path)
PS: If a paragraph has two images, this code will prove to be sub-optimal.. One will have to make some change in the following:
# Emits a single placeholder (and consumes a single image path) per paragraph,
# regardless of how many inline elements were found in it.
if(len(inlines) > 0):
uid = "img_"+str(im_idx)
r.add_text("{{ " + uid + " }}")
context[uid] = InlineImage(dest,im_addresses[im_idx])
im_idx += 1
Will have to add a for loop inside the if statement as well. Since I didn't need as usually my images were big enough, so they always came in different paragraphs. Just a side note for anyone who may need it..
Cheers!
You could try:
Extracting the images from the first document by unzipping the .docx file (per How can I search a word in a Word 2007 .docx file?)
Save those images to the file system (as foo.png, for instance)
Generate the new .docx file with Python and add the .png file using document.add_picture('foo.png').
This problem is solved by this package https://docxtpl.readthedocs.io/en/latest/
I need to get data from this API https://api.storj.io/contacts/f52624d8ef76df81c40853c22f93735581071434 (sample node)
This is my code (python):
import requests
# Fetch the contact record of one node and print the raw response body.
f = requests.get('https://api.storj.io/contacts/f52624d8ef76df81c40853c22f93735581071434')
# Parenthesised form works as a statement on Python 2 and a call on Python 3.
print(f.text)
I want to save only protocol, responseTime and reputation in three subsequent lines of the txt file. It's supposed to look something like this::
protocol: 1.2.0
responseTime: 8157.912472694088
reputation: 1377
Unfortunately, I'm stuck at this point and I can not process this data in any way
import requests
# Fetch the contact record of one node, then print the selected fields and
# save them to a file, one "key: value" pair per line.
f = requests.get('https://api.storj.io/contacts/f52624d8ef76df81c40853c22f93735581071434')
# Store content as json
answer = f.json()
# List of element you want to keep
items = ['protocol', 'responseTime', 'reputation']
# Format each line once; ': ' (with a space) matches the requested layout.
lines = [item + ': ' + str(answer[item]) for item in items]
# Display
for line in lines:
    print(line)
# If you want to save in a file
with open("Output.txt", "w") as text_file:
    text_file.write('\n'.join(lines) + '\n')
Hope it helps! Cheers
You just need to transform to a JSON object to be able to access the keys
import requests
import simplejson as json
# Fetch the contact record, decode the JSON body and print three of its fields.
f = requests.get('https://api.storj.io/contacts/f52624d8ef76df81c40853c22f93735581071434')
x = json.loads(f.text)
# .get() yields None for a missing key instead of raising KeyError.
for key in ('protocol', 'responseTime', 'reputation'):
    # Single-argument parenthesised print runs on both Python 2 and 3.
    print('{}: {}'.format(key, x.get(key)))
This is a very unrefined way to do what you want that you could build off of. You'd need to sub in a path/filename for text.txt.
import requests
import json
# Fetch the contact record and append three of its fields to text.txt,
# one "key: value" pair per line.
f = requests.get('https://api.storj.io/contacts/f52624d8ef76df81c40853c22f93735581071434')
t = json.loads(f.text)
with open('text.txt', 'a') as mfile:
    for key in ('protocol', 'responseTime', 'reputation'):
        # The trailing newline keeps each field on its own line; without it
        # the three values run together on a single line.
        mfile.write("{0}: {1}\n".format(key, str(t[key])))
I'm trying to iterate through the tables in an HTML file by a search label, store the value found in a dictionary, and then write those values to a CSV file. The output currently works for both the url and the headline, but the name output is either blank or shows "None". If I print blog["name"] inside the loop, however, it correctly shows the information I want. I suspect that it's an indentation error, but I can't figure out where to line things up. I've tried moving things around, but nothing seems to make the name assignment work inside that loop.
import os
from bs4 import BeautifulSoup
import my_csv_writer
def td_finder(tr, searchLabel):
    """Return the text of the row's second cell if the row mentions a label.

    Args:
        tr: Table-row object exposing ``.text`` and ``.findAll('td')``.
        searchLabel: Substring looked for anywhere in the row's text.

    Returns:
        The ``.text`` of the second ``td`` when the label occurs in the row
        and the row has more than one cell; otherwise the empty string.
    """
    if searchLabel not in tr.text:
        return ""
    tds = tr.findAll('td')
    if len(tds) > 1:
        return tds[1].text
    return ""
def _first_labelled_value(soup, label):
    """Return the first non-empty td_finder() result over all table rows, or ""."""
    for table in soup.findAll("table"):
        for tr in table.findAll("tr"):
            value = td_finder(tr, label)
            if value:
                return value
    return ""


def main():
    """Write one CSV row (url, headline, name) for every .html file under topdir."""
    topdir = 'some_directory'
    writer = my_csv_writer.CsvWriter("output.csv")
    writer.writeLine(["url", "headline", "name"])
    for root, dirs, files in os.walk(topdir):
        for f in files:
            url = os.path.join(root, f)
            url = os.path.dirname(url).split('some_file')[1]
            if not f.lower().endswith(".html"):
                continue
            # `with` closes the file handle as soon as the markup is read.
            with open(os.path.join(root, f), "r") as html_file:
                soup = BeautifulSoup(html_file.read())
            blog = {}
            #Blog Title
            blog["title"] = soup.find('title').text
            # Keep the first match. Assigning on every row lets later rows
            # without the label overwrite a found name with "", and leaves
            # the key unset (printed as "None") when there are no rows.
            blog["name"] = _first_labelled_value(soup, "name:")
            seq = [url, unicode(blog["title"]), unicode(blog["name"])]
            writer.writeLine(seq)


if __name__ == '__main__':
    main()
    # Parenthesised single-argument print runs on both Python 2 and 3.
    print("Finished main")
You're writing unicode strings to a csv file which according to the official docs "The csv module doesn’t directly support reading and writing Unicode...".
It does offer alternative classes to enable different encodings via UnicodeWriter. The following answer from Boud on SO highlights the need to set the desired encoding in the CSV file.
I am trying to read all the links inside the <returnLink> tag and create wiki links out of them. Basically, I want to read each link from the XML file and create a wiki link labelled with the last word of the link (see below for what I mean by "last word"). For some reason I am running into the following error. What am I missing? Please suggest a fix.
http://wiki.build.com/ca_builds/CIT (last word is CIT)
http://wiki.build.com/ca_builds/1.2_Archive(last word is 1.2_Archive)
INPUT XML:-
<returnLink>
http://wiki.build.com/ca_builds/CIT
http://wiki.build.com/ca_builds/1.2_Archive
</returnLink>
PYTHON code
def getReturnLink(xml):
    """Collect the return links from the config file as wiki-link strings.

    Each non-blank line between ``<returnLink>`` and ``</returnLink>`` is
    treated as one URL and turned into ``[<url> <last word>]<br>``, where the
    last word is the part of the URL after its final ``/``.

    Args:
        xml: The config file contents as a string.

    Returns:
        A list of wiki-link strings (possibly empty), or None when the
        ``<returnLink>`` element is missing or not closed.
    """
    open_tag = '<returnLink>'
    close_tag = '</returnLink>'
    link_start = xml.find(open_tag)
    if link_start == -1:
        return None
    link_end = xml.find(close_tag, link_start)
    if link_end == -1:
        return None

    linklis = []
    for url in xml[link_start + len(open_tag):link_end].splitlines():
        url = url.strip()
        if not url:
            continue
        # Ignore a trailing '/' so that ".../CIT/" still yields "CIT".
        lastword = url.rstrip('/').rpartition('/')[2]
        # Concatenate the single URL, not the whole list of lines.
        linklis.append('[' + url + ' ' + lastword + ']<br>')
    return linklis
OUTPUT:-
line = '['+link+' lastword]<br>'
TypeError: cannot concatenate 'str' and 'list' objects
EXPECTED OUTPUT:-
CIT (this will point to http://wiki.build.com/ca_builds/CIT
1.2_Archive (this will point to http://wiki.build.com/ca_builds/1.2_Archive 1.2_Archive)
Python standard library has xml parser. You can also support multiple <returnLink> elements and Unicode words in an url:
import posixpath
import urllib
import urlparse
from xml.etree import cElementTree as etree
def get_word(url):
    """Return the percent-decoded last path segment of *url* as text.

    The query string and fragment are ignored because only the path component
    of the URL is inspected.
    """
    # Function-scope imports keep the helper usable on both Python 3 and 2.
    try:
        from urllib.parse import unquote, urlsplit
    except ImportError:  # Python 2
        from urllib import unquote
        from urlparse import urlsplit

    basename = posixpath.basename(urlsplit(url).path)
    word = unquote(basename)
    # Python 2 returns a byte string here; Python 3 already returns text.
    if isinstance(word, bytes):
        word = word.decode("utf-8")
    return word
# Build one "[url word]" wiki link for every non-blank line found in the text
# of each <returnLink> element, then print the resulting list.
wikilinks = []
for links in etree.parse(input_filename_or_file).iter('returnLink'):
    for raw_line in links.text.splitlines():
        url = raw_line.strip()
        if url:
            wikilinks.append(u"[{} {}]".format(url, get_word(url)))
print(wikilinks)
Note: work with Unicode internally. Convert the text to bytes only to communicate with outside world e.g., when writing to a file.
Example
[http://wiki.build.com/ca_builds/CIT#some-fragment CIT]
[http://wiki.build.com/ca_builds/Unicode%20%28%E2%99%A5%29 Unicode (♥)]
Instead of parsing XML by hand, use a library like lxml:
>>> s = """<returnLink>
... http://wiki.build.com/ca_builds/CIT
... http://wiki.build.com/ca_builds/1.2_Archive
... </returnLink>"""
>>> from lxml import etree
>>> xml_tree = etree.fromstring(s)
>>> links = xml_tree.text.split()
>>> for i in links:
... print '['+i+']'+i[i.rfind('/')+1:]
...
[http://wiki.build.com/ca_builds/CIT]CIT
[http://wiki.build.com/ca_builds/1.2_Archive]1.2_Archive
I'm not sure what you mean by wikilinks, but the above should give you an idea on how to parse the string.
I'm having some difficulty understanding your question, but it seems like you just want to return the string after the last '/' character in the link. You can do this with a reverse find:
# rfind() returns -1 when there is no '/', so the slice then starts at 0 and
# the whole string is returned.
return link[link.rfind('/') + 1:]
Hello, I am having trouble with an XML file I am using. Whenever I try to read the msg tag, I get an error that prevents me from accessing the data. Here is the code I have written so far.
from xml.dom import minidom
import smtplib
from email.mime.text import MIMEText
from datetime import datetime
def xml_data(path=r'C:\opidea_2.xml'):
    """Print and return the date and comment of every logentry in an XML log.

    Args:
        path: Location of the XML file; defaults to the originally
            hard-coded path.

    Returns:
        The text that was printed: for each ``logentry`` its date followed by
        either the first child node's value of ``msg`` or "No comment made."
    """
    dom = minidom.parse(path)
    parts = []
    for entry in dom.getElementsByTagName('logentry'):
        xmlDate = entry.getElementsByTagName('date')[0].firstChild.nodeValue
        parts.append(xmlDate + '\n ')
        # getElementsByTagName returns a NodeList, so index it before asking
        # for firstChild; an empty <msg/> has no child node at all.
        xmlMsg = entry.getElementsByTagName('msg')
        if xmlMsg and xmlMsg[0].firstChild is not None:
            xmlMsgc = xmlMsg[0].firstChild.nodeValue
            # Append the message text, not the repr of the NodeList.
            parts.append(" Comments: \n " + xmlMsgc + '\n\n')
        else:
            parts.append("No comment made.")
    content = ''.join(parts)
    print(content)
    return content


if __name__ == "__main__":
    xml_data()
Here is part of the xml if it helps.
<log>
<logentry
revision="33185">
<author>glv</author>
<date>2012-08-06T21:01:52.494219Z</date>
<paths>
<path
kind="file"
action="M">/branches/Patch_4_2_0_Branch/text.xml</path>
<path
kind="dir"
action="M">/branches/Patch_4_2_0_Branch</path>
</paths>
<msg>PATCH_BRANCH:N/A
BUG_NUMBER:N/A
FEATURE_AFFECTED:N/A
OVERVIEW:N/A
Adding the SVN log size requirement to the branch
</msg>
</logentry>
</log>
Now when i use xmlMsg = num.getElementsByTagName('msg')[0].toxml() I can get the code to work, I just have to do a lot of replacing and I rather not have to do that. Also I have date working using xmlDate = num.getElementsByTagName('date')[0].firstChild.nodeValue.
Is there something I am missing or doing wrong? Also here is the traceback.
Traceback (most recent call last):
File "C:\python\src\SVN_Email_copy.py", line 141, in <module>
xml_data ()
File "C:python\src\SVN_Email_copy.py", line 94, in xml_data
xmlMsg = num.getElementsByTagName('msg').firstChild.nodeValue
AttributeError: 'NodeList' object has no attribute 'firstChild'
I suggest a different approach. Below is a program that does what you want (I think...). It uses the ElementTree API instead of minidom. This simplifies things quite a bit.
You have posted several related questions concerning parsing of an XML file using minidom. I really think you should look into ElementTree (and for even more advanced stuff, check out ElementTree's "superset", lxml). Both these APIs are much easier to work with than minidom.
import xml.etree.ElementTree as ET
def xml_data(path="opidea_2.xml"):
    """Print and return the date and comment of every logentry in an XML log.

    Args:
        path: Location of the XML file; defaults to the originally
            hard-coded file name.

    Returns:
        The text that was printed: for each ``logentry`` child of the root
        its date followed by either the ``msg`` text or "No comment made."
    """
    tree = ET.parse(path)
    parts = []
    for logentry in tree.findall("logentry"):
        parts.append(logentry.find("date").text + '\n ')
        msg = logentry.find("msg")
        # An empty <msg/> has text None, which cannot be concatenated.
        if msg is not None and msg.text:
            parts.append(" Comments: \n " + msg.text + '\n\n')
        else:
            parts.append("No comment made.")
    content = "".join(parts)
    print(content)
    return content


if __name__ == "__main__":
    xml_data()
Output when using your XML sample (you may want to work a bit more on the exact layout):
2012-08-06T21:01:52.494219Z
Comments:
PATCH_BRANCH:N/A
BUG_NUMBER:N/A
FEATURE_AFFECTED:N/A
OVERVIEW:N/A
Adding the SVN log size requirement to the branch
I was doing the code wrong it seems. Here is how i was able to solve it.
# Only read the message when a <msg> element exists and actually has a child
# node; an empty element has firstChild None.
if len(xmlMsg) > 0 and xmlMsg[0].firstChild is not None:
    xmlMsgc = xmlMsg[0].firstChild.nodeValue
    # Flatten the multi-line message onto a single line.
    xmlMsgpbr = xmlMsgc.replace('\n', ' ')
    xmlMsgf.append(xmlMsgpbr)
else:
    # NOTE(review): this rebinds xmlMsgf to a string although the branch above
    # calls .append() on it -- confirm whether an append was intended here.
    xmlMsgf = "No comments made"
I never checked if first child had any value or not. That's what I was missing. the other answers helped well but this is how i was able to get it to work. Thank you guys.
myNodeList.item( 0)
maybe...
http://docs.python.org/library/xml.dom.html
use this... print "%s" %(num.getElementsByTagName('date')[0].firstChild.data)