BeautifulSoup's extract partly failed - python

I downloaded the HTML source of https://down.ali213.net/pcgame/all/falcom-0-0-0-new-pic-1 as the file htmlstr1.html.
Then I run the following Python code on htmlstr1.html. In the code I am trying to delete all the <a> tags and <span> tags with the bs4 method extract(). However, I notice there are still a few <a> tags and <span> tags left in the output file htmlstr_extracted1.html.
I don't know where it goes wrong; can anybody help?
# Extract the features of an HTML document
from bs4 import BeautifulSoup
from bs4 import NavigableString, Tag, Comment
import re
import hashlib


def read_file(_path):
    with open(_path, "r", encoding="utf-8") as f:
        _html_str = f.read()
    return _html_str


def write_file(_path, _str):
    with open(_path, "w", encoding="utf-8") as f:
        f.write(_str)


def md5_str(_str):
    md = hashlib.md5(_str.encode())
    return md.hexdigest()


class HtmlFeatureExtractor:
    def __init__(self, _html_str):
        self.del_col = []
        _html_str = self.replace_spaces_with_space(_html_str)
        _html_str = self.remove_all_br(_html_str)
        self.soup = BeautifulSoup(_html_str, 'lxml')

    def inline_block_check(self, parent_obj):
        if isinstance(parent_obj, Tag):
            if parent_obj.name in ['p', 'a', 'span', 'h3', 'b', 'h1', 'strong', 'font', 'h1', 'h4', 'li'] and len(
                    parent_obj.parent.contents) == 1:
                self.collect_del_element(parent_obj)
                self.inline_block_check(parent_obj.parent)

    def collect_del_element(self, element):
        if element not in self.del_col:
            self.del_col.append(element)

    def remove_all_br(self, _str):
        _str = _str.replace("\r\n", "")
        _str = _str.replace("\r", "")
        _str = _str.replace("\n", "")
        return _str

    def replace_spaces_with_space(self, _str):
        pattern = re.compile(r'\s+', re.I | re.S)
        _str = pattern.sub(' ', _str)
        return _str

    def not_others_child(self, element):
        for i in self.del_col:
            if element is i:
                continue
            if isinstance(i, Tag) or isinstance(i, NavigableString):
                try:
                    for ii in i.descendants:
                        if element is ii:
                            return False
                except Exception as err:
                    continue
        return True

    def extract(self):
        for x in self.soup.descendants:
            if isinstance(x, NavigableString):
                self.collect_del_element(x)
                if isinstance(x.parent, Tag):
                    if len(x.parent.contents) == 1 and x.parent.name not in ['[document]', 'html', 'body']:  # go one more level for the inline part, and the child is unique
                        self.collect_del_element(x.parent)  # first level: delete directly
                        self.inline_block_check(x.parent.parent)  # further up: recurse
            else:
                if x.name in ['script', 'meta', 'link', 'style', 'img', 'a', 'input', 'iframe', 'form', 'p', 'li', 'span']:  # must also include <a> tags; !!! write a deduplication check
                    self.collect_del_element(x)  # delete afterwards
                    if x.name in ['img', 'a', 'iframe']:
                        self.inline_block_check(x.parent)
        self.del_col = [i for i in self.del_col if self.not_others_child(i)]
        for y in self.del_col:
            if isinstance(y, Tag) or isinstance(y, NavigableString):
                y.extract()
                # z = y.extract()
                # print("-" * 30)
                # print(z)
        _feature_str = str(self.soup)
        _feature_str = self.replace_spaces_with_space(_feature_str)
        _feature_str = self.remove_all_br(_feature_str)
        write_file("./htmlstr_extracted%d.html" % gg, _feature_str)  # TODO: debugging only
        return md5_str(_feature_str)


if __name__ == '__main__':
    g_num = [1]
    for gg in g_num:
        html_str = read_file("./htmlstr%d.html" % gg)
        feature_extractor = HtmlFeatureExtractor(html_str)
        feature_str = feature_extractor.extract()
        print("Success: %s" % feature_str)
My purpose is to extract the features of an HTML source; I want to get the same hash value if two webpages use the same template. The following two pages use the same template, so they should have the same hash value as calculated by the code above.
https://down.ali213.net/pcgame/all/falcom-0-0-0-new-pic-1
https://down.ali213.net/pcgame/all/rockstar-0-0-0-new-pic-1
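As a sanity check on that goal, here is a minimal sketch (not a fix of the class above, and only one assumption about what "feature" should mean) that strips the unwanted tags with decompose(), drops the text nodes, and hashes the remaining tag skeleton:

# Minimal sketch: remove unwanted tags and all text, hash the tag skeleton.
# Assumes the same local file naming as in the question (htmlstr1.html).
from bs4 import BeautifulSoup
import hashlib

def template_hash(path):
    with open(path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f.read(), "lxml")
    # decompose() deletes a tag together with everything inside it
    for tag in soup(["script", "style", "meta", "link", "a", "span", "img", "iframe"]):
        tag.decompose()
    # extract the remaining text nodes so only the structure is hashed
    for text in list(soup.find_all(string=True)):
        text.extract()
    return hashlib.md5(str(soup).encode()).hexdigest()

print(template_hash("./htmlstr1.html"))

If the two pages really do share a template, their hashes should match under this much cruder reduction as well.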

Related

Can't access function in OOP Python

For some reason, in my fruit scraper, I cannot access anything from the listify function.
I'm getting an error, for example: NameError: name 'family' is not defined.
And I can't figure out what is wrong with my code: is my function bad, or am I doing something wrong with the class?
import requests
import json
import random
import pickle


class FruitScraper():
    def __init__(self):
        self.name = []
        self.id = []
        self.family = []
        self.genus = []
        self.order = []
        self.carbohydrates = []
        self.protein = []
        self.fat = []
        self.calories = []
        self.sugar = []

    def scrape_all_fruits(self):
        data_list = []
        try:
            for ID in range(1, 10):
                url = f'https://www.fruityvice.com/api/fruit/{ID}'
                response = requests.get(url)
                data = response.json()
                data_list.append(data)
        except:
            pass
        return data_list

    def listify(self, stats):
        alist = json.dumps(self.scrape_all_fruits())
        jsonSTr = json.loads(alist)
        for i in jsonSTr:
            try:
                self.name.append(i['name'])
                self.id.append(i['id'])
                self.family.append(i['family'])
                self.genus.append(i['genus'])
                self.order.append(i['order'])
                self.carbohydrates.append(i['nutritions']['carbohydrates'])
                self.protein.append(i['nutritions']['protein'])
                self.fat.append(i['nutritions']['fat'])
                self.calories.append(i['nutritions']['calories'])
                self.sugar.append(i['nutritions']['sugar'])
            except:
                pass
        return stats

    def get_summary(self):
        for i in self.listify(zip(self.fat, self.protein, self.calories, self.sugar, self.carbohydrates, self.name)):
            nutr_stats = f'\nNutrients maximum statistics:\nFat: {max(self.fat)}\nProtein: {max(self.protein)}\nCarbohydrates: {max(self.carbohydrates)}\nCalories: {max(self.calories)}\nSugar: {max(self.sugar)}' \
                         f'\nNutrients minimum statistics:\nFat: {min(self.fat)}\nProtein: {min(self.protein)}\nCarbohydrates: {min(self.carbohydrates)}\nCalories: {min(self.calories)}\nSugar: {min(self.sugar)}' \
                         f'\nTotal fruits scraped: {len(self.name)}'
        return nutr_stats


Scraped_info = FruitScraper().scrape_all_fruits()
Listified_info = FruitScraper().listify(family)
Fruits_statistics = FruitScraper().get_summary()
It's my first time doing OOP.
Please consider changing this
Scraped_info = FruitScraper().scrape_all_fruits()
Listified_info = FruitScraper().listify(family)
Fruits_statistics = FruitScraper().get_summary()
to
myScraper = FruitScraper()
Scraped_info = myScraper.scrape_all_fruits()
myScraper.listify()
Fruits_statistics = myScraper.get_summary()
Otherwise you create three different objects of this class and discard them with all their attributes after running the individual method once.
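A tiny demonstration of that pitfall (assuming the no-argument listify() suggested below):

scraper_a = FruitScraper()
scraper_a.listify()         # fills the lists on this instance only
scraper_b = FruitScraper()
print(len(scraper_b.name))  # 0: scraper_b never ran listify(), its lists are fresh and empty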
You would also need to define family before using it in this line of the code:
Listified_info = myScraper.listify(family)
But I can't see how you intended to use the parameter stats in your method listify(). It is just received and returned. I suggest that you change:
def listify(self, stats):
to
def listify(self):
and remove
return stats
If you want to get those lists inside the object of this class returned by listify(), you may do the following (but this is not the OOP way of doing things):
import requests
import json
import copy


class FruitScraper():
    def __init__(self):
        self.name = []
        self.id = []
        self.family = []
        self.genus = []
        self.order = []
        self.carbohydrates = []
        self.protein = []
        self.fat = []
        self.calories = []
        self.sugar = []

    def collect_all_lists(self):
        self.allLists = {'name': self.name, 'id': self.id, 'family': self.family,
                         'genus': self.genus, 'order': self.order,
                         'carbohydrates': self.carbohydrates, 'protein': self.protein,
                         'fat': self.fat, 'calories': self.calories, 'sugar': self.sugar}

    def scrape_all_fruits(self):
        data_list = []
        try:
            for ID in range(1, 10):
                url = f'https://www.fruityvice.com/api/fruit/{ID}'
                response = requests.get(url)
                data = response.json()
                data_list.append(data)
        except:
            pass
        return data_list

    def listify(self):
        alist = json.dumps(self.scrape_all_fruits())
        jsonSTr = json.loads(alist)
        for i in jsonSTr:
            try:
                self.name.append(i['name'])
                self.id.append(i['id'])
                self.family.append(i['family'])
                self.genus.append(i['genus'])
                self.order.append(i['order'])
                self.carbohydrates.append(i['nutritions']['carbohydrates'])
                self.protein.append(i['nutritions']['protein'])
                self.fat.append(i['nutritions']['fat'])
                self.calories.append(i['nutritions']['calories'])
                self.sugar.append(i['nutritions']['sugar'])
            except:
                pass
        self.collect_all_lists()
        return copy.deepcopy(self.allLists)

    def get_summary(self):
        for i in self.listify():
            nutr_stats = f'\nNutrients maximum statistics:\nFat: {max(self.fat)}\nProtein: {max(self.protein)}\nCarbohydrates: {max(self.carbohydrates)}\nCalories: {max(self.calories)}\nSugar: {max(self.sugar)}' \
                         f'\nNutrients minimum statistics:\nFat: {min(self.fat)}\nProtein: {min(self.protein)}\nCarbohydrates: {min(self.carbohydrates)}\nCalories: {min(self.calories)}\nSugar: {min(self.sugar)}' \
                         f'\nTotal fruits scraped: {len(self.name)}'
        return nutr_stats


myScraper = FruitScraper()
Scraped_info = myScraper.scrape_all_fruits()
Listified_info = myScraper.listify()
Fruits_statistics = myScraper.get_summary()

List comes back as empty when retrieving data from website; Python

I am trying to parse data from a website by inserting the data into a list, but the list comes back empty.
url =("http://www.releasechimps.org/resources/publication/whos-there-md- anderson")
http = urllib3.PoolManager()
r = http.request('Get',url)
soup = BeautifulSoup(r.data,"html.parser")
#print(r.data)
loop = re.findall(r'<td>(.*?)</td>',str(r.data))
#print(str(loop))
newLoop = str(loop)
#print(newLoop)
for x in range(1229):
if "\\n\\t\\t\\t\\t" in loop[x]:
loop[x] = loop[x].replace("\\n\\t\\t\\t\\t","")
list0_v2.append(str(loop[x]))
print(loop[x])
print(str(list0_v2))
Edit: Didn't really have anything else going on, so I made your data format into a nice list of dictionaries. There's a weird <td height="26"> on monkey 111, so I had to change the regex slightly.
Hope this helps you; I did it because I care about the monkeys, man.
import html
import re
import urllib.request

list0_v2 = []
final_list = []

url = "http://www.releasechimps.org/resources/publication/whos-there-md-anderson"
data = urllib.request.urlopen(url).read()

loop = re.findall(r'<td.*?>(.*?)</td>', str(data))

for item in loop:
    if "\\n\\t\\t\\t\\t" in item or "em>" in item:
        item = item.replace("\\n\\t\\t\\t\\t", "").replace("<em>", "")\
            .replace("</em>", "")
    if " " == item:
        continue
    list0_v2.append(item)

n = 1
while len(list0_v2) != 0:
    form = {"n": 0, "name": "", "id": "", "gender": "", "birthdate": "", "notes": ""}
    try:
        if list0_v2[5][-1] == '.':
            numb, name, ids, gender, birthdate, notes = list0_v2[0:6]
            form["notes"] = notes
            del(list0_v2[0:6])
        else:
            raise Exception('foo')
    except:
        numb, name, ids, gender, birthdate = list0_v2[0:5]
        del(list0_v2[0:5])
    form["n"] = int(numb)
    form["name"] = html.unescape(name)
    form["id"] = ids
    form["gender"] = gender
    form["birthdate"] = birthdate
    final_list.append(form)
    n += 1

for li in final_list:
    print("{:3} {:10} {:10} {:3} {:10} {}".format(li["n"], li["name"], li["id"],
          li["gender"], li["birthdate"], li["notes"]))

Add another use case to a python code

I have the following Python class.
from body_parser import Extractor
import re


class FEOProcessor(object):

    CHECKS = [
        ('Standard JavaScript Inlining Optimization', ('EMBED_JAVASCRIPT',), 'check_js_inlining'),
        ('HTML5 Advanced Cache', ('JAVASCRIPT_HTML5_CACHE', 'CSS_HTML5_CACHE'), 'check_html5_advanced_cache'),
        ('Cookieless Resource Domain', ('RENAME_JAVASCRIPT', 'RENAME_CSS'), 'check_cookieless_resource_domain'),
        ('Minification of JS', ('MINIFY_JAVASCRIPT',), 'check_js_minifaction'),
        ('File Versioning', ('RENAME_JAVASCRIPT', 'RENAME_IMAGE', 'RENAME_CSS'), 'check_file_versioning'),
        ('Small Image Embedding', ('EMBED_IMAGE',), 'check_small_image_embedding'),
        ('Responsive Image Loading', ('RESPONSIVE_IMAGES',), 'check_responsive_image_loading'),
        ('Asynchronous JS and CSS Loading', ('ASYNC_JAVASCRIPT',), 'check_async_js_and_css_loading'),
        ('JS Pre-Execution', ('PRE_EXECUTE_JAVASCRIPT',), 'check_js_pre_execution'),
        ('EDGESTART', ('EDGESTART',), 'check_edgestart'),
        ('Invoke Click OnTouch', ('BlzFastClick',), 'check_click'),
        ('Cellular Connection Keep-Alive', ('blzEnableMobileHeartbeat',), 'check_cell'),
    ]

    def __init__(self):
        self.parser = Extractor()
        self.result = dict((k, False) for k, _, _ in self.CHECKS)
        # bind a generated check for each entry so getattr(self, name) can
        # find it; explicitly defined check_* methods take precedence
        for _, keys, name in self.CHECKS:
            if not hasattr(self, name):
                setattr(self, name, lambda result, _keys=keys: all(result.get(k, 0) > 0 for k in _keys))

    def process_feo_debug_output(self, analysis_id, url):
        feed = self.parser.start_parser(analysis_id, url, True)
        result = self.get_feo_tags(feed)
        for name, _, func in self.CHECKS:
            self.result[name] = (False, True)[getattr(self, func)(result)]
        return self.result

    def get_feo_tags(self, feed):
        result = {}
        tag_list = re.findall(r'(?:TextTransApplied):\s*((?:(?:[A-Z]+(?:_[A-Z\d]+)+)?\(\d+\)\s*(?:,\s*|;))*)', str(feed))
        for tag in tag_list:
            for element in tag.split(","):
                index = element.index('(')
                if element[:index].strip():
                    result[element[:index].strip()] = (element.split("(")[1].rstrip(");"))
        return result

    def check_edgestart(self, result):
        return 1 if 'EDGESTART' in result.keys() else 0

    def check_click(self, result):
        return 1 if 'BlzFastClick' in result.keys() else 0

    def check_cell(self, result):
        return 1 if 'blzEnableMobileHeartbeat' in result.keys() else 0
which returns me a dict of True and False values based on the checks. I want an additional check based on the feed, which is the input to the get_feo_tags method.
The other check that needs to be incorporated is:

for img in feed.find_all('img', attrs={'data-blzsrc': True, 'src': lambda x: 'data' not in x}):
    # append {'On Demand Image Loading': True} to the result dict

How can I do this with the current setup?
If soup is the same as feed, you can try this:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from body_parser import Extractor
import re


class FEOProcessor(object):

    CHECKS = [
        ('Standard JavaScript Inlining Optimization', ('EMBED_JAVASCRIPT',), 'check_js_inlining'),
        ('HTML5 Advanced Cache', ('JAVASCRIPT_HTML5_CACHE', 'CSS_HTML5_CACHE'), 'check_html5_advanced_cache'),
        ('Cookieless Resource Domain', ('RENAME_JAVASCRIPT', 'RENAME_CSS'), 'check_cookieless_resource_domain'),
        ('Minification of JS', ('MINIFY_JAVASCRIPT',), 'check_js_minifaction'),
        ('File Versioning', ('RENAME_JAVASCRIPT', 'RENAME_IMAGE', 'RENAME_CSS'), 'check_file_versioning'),
        ('Small Image Embedding', ('EMBED_IMAGE',), 'check_small_image_embedding'),
        ('Responsive Image Loading', ('RESPONSIVE_IMAGES',), 'check_responsive_image_loading'),
        ('Asynchronous JS and CSS Loading', ('ASYNC_JAVASCRIPT',), 'check_async_js_and_css_loading'),
        ('JS Pre-Execution', ('PRE_EXECUTE_JAVASCRIPT',), 'check_js_pre_execution'),
        ('EDGESTART', ('EDGESTART',), 'check_edgestart'),
        ('Invoke Click OnTouch', ('BlzFastClick',), 'check_click'),
        ('Cellular Connection Keep-Alive', ('blzEnableMobileHeartbeat',), 'check_cell'),
        ('On Demand Image Loading', ('img',), 'check_img'),
    ]

    def __init__(self):
        self.parser = Extractor()
        self.result = dict((k, False) for k, _, _ in self.CHECKS)
        # bind a generated check for each entry so getattr(self, name) can
        # find it; explicitly defined check_* methods take precedence
        for _, keys, name in self.CHECKS:
            if not hasattr(self, name):
                setattr(self, name, lambda result, _keys=keys: all(result.get(k, 0) > 0 for k in _keys))

    def process_feo_debug_output(self, analysis_id, url):
        feed = self.parser.start_parser(analysis_id, url, True)
        # this works only if feed is the same as soup as you said
        result = self.get_feo_tags(feed)
        for name, _, func in self.CHECKS:
            if name == 'On Demand Image Loading':
                self.result[name] = (False, True)[getattr(self, func)(feed)]
            else:
                self.result[name] = (False, True)[getattr(self, func)(result)]
        return self.result

    def get_feo_tags(self, feed):
        result = {}
        tag_list = re.findall(r'(?:TextTransApplied):\s*((?:(?:[A-Z]+(?:_[A-Z\d]+)+)?\(\d+\)\s*(?:,\s*|;))*)', str(feed))
        for tag in tag_list:
            for element in tag.split(","):
                index = element.index('(')
                if element[:index].strip():
                    result[element[:index].strip()] = (element.split("(")[1].rstrip(");"))
        return result

    def check_edgestart(self, result):
        return 1 if 'EDGESTART' in result.keys() else 0

    def check_click(self, result):
        return 1 if 'BlzFastClick' in result.keys() else 0

    def check_cell(self, result):
        return 1 if 'blzEnableMobileHeartbeat' in result.keys() else 0

    def check_img(self, feed):
        return 1 if feed.find_all('img', attrs={'data-blzsrc': True, 'src': lambda x: 'data' not in x}) else 0
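For completeness, usage would look something like this; Extractor and process_feo_debug_output() come from the question's own code, while the analysis id and URL below are made-up placeholder values:

processor = FEOProcessor()
results = processor.process_feo_debug_output('12345', 'http://example.com/page')  # placeholder arguments
for check_name, passed in results.items():
    print("%s: %s" % (check_name, passed))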

Generating Python soaplib stubs from WSDL

I'd like to generate a stub SOAP web service class using the Python soaplib module, based on an existing WSDL. The idea is to generate a mock for a third party web service.
Does any such code generator exist, or must we write our own?
Martin
Okay, I had a go at hacking my wsdl2interface (http://pypi.python.org/pypi/wsdl2interface) script to output soaplib code. I think I have something that works, though it's not pretty or especially well tested.
I'll paste it here for the record. I could be persuaded to release it if someone needs it, though it's not exactly my best code. Note that it uses Suds' WSDL parser to generate soaplib code, which is a bit strange in itself.
Run like this:
$ wsdl2soaplib <url or filename of WSDL> > wsdl.py
The code (you'll need suds in your path, ideally in a virtualenv):
from StringIO import StringIO

import os.path
import sys
import textwrap
import keyword
import re

import suds.client

VALID_IDENTIFIER_RE = re.compile(r"[_A-Za-z][_A-Za-z1-9]*")
VALID_IDENTIFIER_FIRST_LETTER_RE = re.compile(r"[_A-Za-z]")
VALID_IDENTIFIER_SUBSEQUENT_LETTER_RE = re.compile(r"[_A-Za-z1-9]")

HEADER = '''\
"""SOAP web services generated from:
%(wsdl)s.
"""
from soaplib.serializers.primitive import (
    String, Integer, Float, Double, DateTime, Boolean, Null, Array, Map, Any
)
from soaplib.serializers.clazz import ClassSerializer
from soaplib.service import SoapServiceBase
from soaplib.service import soapmethod
'''

INTERFACE = '''\
class %(name)s(%(bases)s):
    """%(docstring)s"""
'''

SERVICE_INTERFACE_DOCSTRING = '''\
SOAP service ``%(serviceName)s`` with target namespace %(tns)s.
'''

TYPE_INTERFACE_DOCSTRING = '''\
SOAP %(type)s ``{%(namespace)s}%(name)s``
'''

TYPE_MAP = '''\
WSDL_TYPES = {
%(items)s
}
'''

SOAPMETHOD = '''    @soapmethod(%(args)s, _returns=%(response)s)'''
METHOD = '''    def %(name)s(self, %(args)s):'''
METHOD_DOCSTRING = '''\
        """Parameters:
        %(args)s
        Returns: %(response)s
        """
'''

STANDARD_TYPE_NAMESPACES = [
    'http://schemas.xmlsoap.org/soap/encoding/',
    'http://schemas.xmlsoap.org/wsdl/',
    'http://www.w3.org/2001/XMLSchema'
]

SCHEMA_TYPE_MAPPING = {
    None: '%(typeName)s',
    'None': 'None',
    'boolean': 'Boolean',
    'string': 'String',
    'long': 'Integer',
    'int': 'Integer',
    'short': 'Integer',
    'byte': 'Integer',
    'unsignedLong': 'Integer',
    'unsignedInt': 'Integer',
    'unsignedShort': 'Integer',
    'unsignedByte': 'Integer',
    'positiveInteger': 'Integer',
    'nonPositiveInteger': 'Integer',
    'negativeInteger': 'Integer',
    'nonNegativeInteger': 'Integer',
    'float': 'Float',
    'double': 'Float',
    'decimal': 'Decimal',
    'dateTime': 'DateTime',
    'date': 'DateTime',
    'anyURI': 'String',
    'token': 'String',
    'normalizedString': 'String',
    'base64Binary': 'String',
    'hexBinary': 'String',
}


def formatDocstring(text, indent=4, colwidth=78):
    width = colwidth - indent
    joiner = '\n' + ' ' * indent
    return joiner.join(textwrap.wrap(text, width) + [''])


def typeName(type, sd):
    resolved = type.resolve()
    return resolved.name or ''


def schemaTypeName(type, sd, deps=None):
    resolved = type.resolve()
    name = resolved.name or ''

    schemaType = SCHEMA_TYPE_MAPPING.get(name)
    if schemaType is None:  # not a standard type
        # user default
        schemaType = SCHEMA_TYPE_MAPPING[None]
        # possibly save dependency link
        if deps is not None:
            deps.append(unicode(name))

    required = type.required()
    schemaType = schemaType % dict(typeName=name, required=required)

    if type.unbounded():
        schemaType = "Array(%s)" % schemaType

    return schemaType


def normalizeIdentifier(identifier):
    if not VALID_IDENTIFIER_RE.match(identifier):
        newIdentifierLetters = []
        firstLetter = True
        for letter in identifier:
            if firstLetter:
                if VALID_IDENTIFIER_FIRST_LETTER_RE.match(letter):
                    newIdentifierLetters.append(letter)
                else:
                    newIdentifierLetters.append('_')
                firstLetter = False
            else:
                if VALID_IDENTIFIER_SUBSEQUENT_LETTER_RE.match(letter):
                    newIdentifierLetters.append(letter)
                else:
                    newIdentifierLetters.append('_')
        identifier = ''.join(newIdentifierLetters)

    if keyword.iskeyword(identifier):
        identifier = identifier + '_'

    return identifier


def generate(client, url=None, standardTypeNamespaces=STANDARD_TYPE_NAMESPACES, removeInputOutputMesssages=True):
    """Given a WSDL URL, return a file that could become your interfaces.py
    """

    printed = []  # sequence of type name -> string

    for sd in client.sd:
        serviceOut = StringIO()

        print >>serviceOut, HEADER % dict(
            wsdl=url,
        )

        printed.append(('', serviceOut.getvalue(),))

        # Types
        typeMap = {}
        typeSeq = []
        typeDeps = {}
        typeAttributes = {}

        typesPrinted = []

        for type_ in sd.types:
            typeOut = StringIO()

            resolved = type_[0].resolve()
            namespaceURL = resolved.namespace()[1]
            if namespaceURL not in standardTypeNamespaces:

                if resolved.enum():
                    typeDescription = "enumeration"
                else:
                    typeDescription = "complex type"

                # Look for bases
                interfaceBases = []
                if resolved.extension():
                    def find(t):
                        for c in t.rawchildren:
                            if c.extension():
                                find(c)
                            if c.ref is not None:
                                interfaceBases.append(c.ref[0])
                    find(resolved)

                if not interfaceBases:
                    interfaceBases = ['ClassSerializer']

                rawTypeName = typeName(type_[0], sd)
                typeInterfaceName = normalizeIdentifier(rawTypeName)

                typeMap[rawTypeName] = typeInterfaceName
                typeSeq.append((rawTypeName, typeInterfaceName,))
                typeAttributes[rawTypeName] = {}

                print >>typeOut, INTERFACE % dict(
                    name=normalizeIdentifier(typeInterfaceName),
                    bases=', '.join(interfaceBases),
                    docstring=formatDocstring(TYPE_INTERFACE_DOCSTRING % dict(
                        type=typeDescription,
                        name=rawTypeName,
                        namespace=namespaceURL,
                    ))
                )

                print >>typeOut, "    class types:"

                if resolved.enum():
                    for attr in type_[0].children():
                        name = attr[0].name.replace(' ', '_')
                        print >>typeOut, "        %s = String  # XXX: Enumeration value" % name
                else:
                    for attr in type_[0].children():
                        name = attr[0].name.replace(' ', '_')
                        attrTypeName = typeName(attr[0], sd)
                        typeAttributes[rawTypeName][name] = attrTypeName
                        schemaType = schemaTypeName(attr[0], sd, deps=typeDeps.setdefault(unicode(rawTypeName), []))
                        print >>typeOut, "        %s = %s" % (normalizeIdentifier(name), schemaType,)

                print >>typeOut

                typesPrinted.append((rawTypeName, typeOut.getvalue(),))

        serviceInterfaceOut = StringIO()

        # Main service interface
        print >>serviceInterfaceOut, INTERFACE % dict(
            name=normalizeIdentifier(sd.service.name),
            bases=u"SoapServiceBase",
            docstring=formatDocstring(SERVICE_INTERFACE_DOCSTRING % dict(
                serviceName=sd.service.name,
                tns=sd.wsdl.tns[1],
            ))
        )

        methods = {}  # name -> (response type, list of parameters,)

        for p in sd.ports:
            for m in p[1]:
                methodName = m[0]
                methodArgs = m[1]
                if methodName not in methods:
                    methodDef = p[0].method(methodName)

                    # XXX: This discards the namespace part
                    if methodDef.soap.output.body.wrapped:
                        inputMessage = methodDef.soap.input.body.parts[0].element[0]
                        outputMessage = methodDef.soap.output.body.parts[0].element[0]

                        if outputMessage in typeAttributes:
                            if len(typeAttributes[outputMessage]) > 0:
                                response = typeAttributes[outputMessage].values()[0]
                            else:
                                response = "None"
                        else:
                            response = outputMessage

                        # Remove types used as input/output messages
                        if removeInputOutputMesssages:
                            remove = False
                            for idx, (t, x) in enumerate(typesPrinted):
                                if t == inputMessage:
                                    remove = True
                                    break
                            if remove:
                                del typesPrinted[idx]
                                if inputMessage in typeMap:
                                    del typeMap[inputMessage]

                            remove = False
                            for idx, (t, x) in enumerate(typesPrinted):
                                if t == outputMessage:
                                    remove = True
                                    break
                            if remove:
                                del typesPrinted[idx]
                                if outputMessage in typeMap:
                                    del typeMap[outputMessage]
                    else:
                        response = methodDef.soap.output.body.parts[0].element[0]

                    methods[methodName] = (response, methodArgs,)

        for methodName in sorted(methods):
            methodArgNames = [m[0] for m in methods[methodName][1]]
            methodReturnType = methods[methodName][0]

            methodArgDetails = []
            methodArgSpecs = []
            for m in methods[methodName][1]:
                argDetail = m[1]

                # for docstring
                methodModifierParts = []
                if not argDetail.required():
                    methodModifierParts.append('optional')
                if argDetail.nillable:
                    methodModifierParts.append('may be None')
                methodModifiers = ""
                if methodModifierParts:
                    methodModifiers = ' (%s)' % ', '.join(methodModifierParts)

                argTypeName = typeName(argDetail, sd)
                methodSpec = "``%s`` -- %s%s" % (
                    argDetail.name,
                    argTypeName,
                    methodModifiers
                )
                methodArgDetails.append(methodSpec)

                # for @soapmethod decorator
                schemaType = schemaTypeName(argDetail, sd)
                methodArgSpecs.append(schemaType)

            # TODO: Probably not aware of array return types
            if methodReturnType not in typeMap and methodReturnType in SCHEMA_TYPE_MAPPING:
                methodReturnType = SCHEMA_TYPE_MAPPING[methodReturnType]

            print >>serviceInterfaceOut, SOAPMETHOD % dict(
                args=', '.join(methodArgSpecs),
                response=methodReturnType,
            )
            print >>serviceInterfaceOut, METHOD % dict(
                name=normalizeIdentifier(methodName),
                args=', '.join(methodArgNames),
            )
            print >>serviceInterfaceOut, METHOD_DOCSTRING % dict(
                args='\n        '.join(methodArgDetails),
                response=methodReturnType,
            )
            print >>serviceInterfaceOut

        # Sort list of complex types based on internal dependencies
        def sortDeps(printed):
            printed = list(reversed(printed))
            queue = [item for item in printed if len(typeDeps.get(unicode(item[0]), [])) == 0]
            satisfied = set(queue)
            remaining = [item for item in printed if item not in queue]
            sortedPrinted = []
            while queue:
                item = queue.pop()
                itemTypeName = unicode(item[0])

                sortedPrinted.append(item)
                satisfied.add(itemTypeName)
                for item in remaining:
                    remainingItemTypeName = unicode(item[0])
                    depsList = typeDeps.get(remainingItemTypeName, [])

                    remainingDeps = []
                    for dep in depsList:
                        if dep not in satisfied:
                            remainingDeps.append(dep)

                    typeDeps[remainingItemTypeName] = remainingDeps

                    if len(remainingDeps) == 0:
                        queue.append(item)
                        remaining.remove(item)
            return sortedPrinted

        typesPrinted = sortDeps(typesPrinted)

        # Print everything
        printed.extend(typesPrinted)

        printed.append((sd.service.name, serviceInterfaceOut.getvalue(),))

        typeMapOut = StringIO()
        print >>typeMapOut, TYPE_MAP % dict(
            items=',\n'.join(["    '%s': %s" % k for k in typeSeq if k[0] in typeMap])
        )
        print >>typeMapOut

        printed.append(('', typeMapOut.getvalue(),))

    return '\n'.join([v[1] for v in printed])


def main():
    if len(sys.argv) < 2:
        print "Usage: %s <url>" % sys.argv[0]
        print "The output will be printed to the console"
        return

    if not '://' in sys.argv[1]:
        sys.argv[1] = 'file://' + os.path.abspath(sys.argv[1])

    client = suds.client.Client(sys.argv[1])
    print generate(client, sys.argv[1])


if __name__ == '__main__':
    main()
I have just created a github repository where I'm improving on optilude's script to make it work with soaplib2.0 and more. The link is https://github.com/fvieira/wsdl2soaplib.

SVG Glyphs in PyQt

How do I render glyphs in PyQt using QGraphicsSvgItem?
Recently I found that SVG files generated by Cairo do not render properly in PyQt. The problem comes from the use of glyphs, which do not seem to be drawn in PyQt (this might be wrong, but I couldn't find any way of getting glyphs to render).
I ended up writing a set of functions that convert the glyphs to SVG paths so the file renders normally.
These could still use some improvements for rendering color and other style elements (which are hard-coded in the functions I wrote).
These functions will need to be embedded in a class or have self removed to be used elsewhere.
I just wanted people to have these so they wouldn't have to search high and low like I did to find a way to render glyphs in PyQt.
Hope for the best,
Kyle
def convertSVG(self, file):
    dom = self._getsvgdom(file)
    print dom
    self._switchGlyphsForPaths(dom)
    self._commitSVG(file, dom)

def _commitSVG(self, file, dom):
    f = open(file, 'w')
    dom.writexml(f)
    f.close()

def _getsvgdom(self, file):
    print 'getting DOM model'
    import xml.dom
    import xml.dom.minidom as mini
    f = open(file, 'r')
    svg = f.read()
    f.close()
    dom = mini.parseString(svg)
    return dom

def _getGlyphPaths(self, dom):
    symbols = dom.getElementsByTagName('symbol')
    glyphPaths = {}
    for s in symbols:
        pathNode = [p for p in s.childNodes if 'tagName' in dir(p) and p.tagName == 'path']
        glyphPaths[s.getAttribute('id')] = pathNode[0].getAttribute('d')
    return glyphPaths

def _switchGlyphsForPaths(self, dom):
    glyphs = self._getGlyphPaths(dom)
    use = self._getUseTags(dom)
    for glyph in glyphs.keys():
        print glyph
        nl = self.makeNewList(glyphs[glyph].split(' '))
        u = self._matchUseGlyphs(use, glyph)
        for u2 in u:
            print u2, 'before'
            self._convertUseToPath(u2, nl)
            print u2, 'after'

def _getUseTags(self, dom):
    return dom.getElementsByTagName('use')

def _matchUseGlyphs(self, use, glyph):
    matches = []
    for i in use:
        print i.getAttribute('xlink:href')
        if i.getAttribute('xlink:href') == '#' + glyph:
            matches.append(i)
    print matches
    return matches

def _convertUseToPath(self, use, strokeD):
    ## strokeD is a list of lists of strokes to make the glyph
    newD = self.nltostring(self.resetStrokeD(strokeD, use.getAttribute('x'), use.getAttribute('y')))
    use.tagName = 'path'
    use.removeAttribute('xlink:href')
    use.removeAttribute('x')
    use.removeAttribute('y')
    use.setAttribute('style', 'fill: rgb(0%,0%,0%); stroke-width: 0.5; stroke-linecap: round; stroke-linejoin: round; stroke: rgb(0%,0%,0%); stroke-opacity: 1;stroke-miterlimit: 10; ')
    use.setAttribute('d', newD)

def makeNewList(self, inList):
    i = 0
    nt = []
    while i < len(inList):
        start = i + self.listFind(inList[i:], ['M', 'L', 'C', 'Z'])
        end = start + self.listFind(inList[start + 1:], ['M', 'L', 'C', 'Z', '', ' '])
        nt.append(inList[start:end + 1])
        i = end + 1
    return nt

def listFind(self, x, query):
    for i in range(len(x)):
        if x[i] in query:
            return i
    return len(x)

def resetStrokeD(self, strokeD, x, y):
    nsd = []
    for i in strokeD:
        nsd.append(self.resetXY(i, x, y))
    return nsd

def resetXY(self, nl, x, y):  # convert a list of strokes to xy coords
    nl2 = []
    for i in range(len(nl)):
        if i == 0:
            nl2.append(nl[i])
        elif i % 2:  # it's odd
            nl2.append(float(nl[i]) + float(x))
        elif not i % 2:  # it's even
            nl2.append(float(nl[i]) + float(y))
        else:
            print i, nl[i], 'error'
    return nl2

def nltostring(self, nl):  # convert a collection of nl's to a string
    col = []
    for l in nl:
        templ = []
        for c in l:
            templ.append(str(c))
        templ = ' '.join(templ)
        col.append(templ)
    return ' '.join(col)
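To tie it together, here is a hedged sketch of the rendering side: once convertSVG() has rewritten the glyphs as plain <path> elements, the file loads like any other SVG. The PyQt4 class names match the era of this code, and 'figure.svg' is a made-up path:

import sys
from PyQt4.QtGui import QApplication, QGraphicsScene, QGraphicsView
from PyQt4.QtSvg import QGraphicsSvgItem

app = QApplication(sys.argv)
scene = QGraphicsScene()
scene.addItem(QGraphicsSvgItem('figure.svg'))  # the converted file
view = QGraphicsView(scene)
view.show()
sys.exit(app.exec_())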
