import json
import urllib.request
from subprocess import PIPE, run

import dicttoxml
from bs4 import BeautifulSoup


class XML:
    """Wrap a BeautifulSoup node and allow it to be queried with XQuery.

    The wrapped node is stored in ``self.data``.
    """

    # Maps each supported ``lang`` value to the BeautifulSoup parser name.
    # JSON input is converted to XML before parsing, hence the XML parser.
    _PARSERS = {"json": "lxml-xml", "xml": "lxml-xml", "html": "lxml"}

    def __init__(self, **kwargs):
        """Build the wrapper from exactly one source, checked in this order.

        Keyword parameters (provide one of):
            data=  an already-parsed BeautifulSoup object, stored as is
            str=   the markup itself (bytes, or text)
            url=   a URL (http://, file://, ...) whose content is downloaded

        Optional keyword parameter (ignored when ``data`` is given):
            lang=  "xml" (default), "html" or "json"

        Raises:
            ValueError: none of ``data``, ``str`` or ``url`` was provided.
            NotImplementedError: ``lang`` is not one of the supported values.
        """
        if 'data' in kwargs:
            self.data = kwargs['data']
            return

        # fetch data
        if 'str' in kwargs:
            source = kwargs['str']
        elif 'url' in kwargs:
            # Close the response deterministically instead of leaking it.
            with urllib.request.urlopen(kwargs['url']) as response:
                source = response.read()
        else:
            raise ValueError('XML : either str, url, or data must be provided')

        # parse data
        lang = kwargs.get('lang', 'xml')
        if lang not in self._PARSERS:
            # f-string so that a non-string ``lang`` still yields this error
            # rather than a TypeError from string concatenation.
            raise NotImplementedError(f"lang={lang} not supported")

        if lang == 'json':
            # ``str=`` may carry text as well as bytes; only bytes need decoding.
            if isinstance(source, (bytes, bytearray)):
                source = source.decode()
            source = dicttoxml.dicttoxml(
                json.loads(source), attr_type=False, custom_root='json'
            )

        self.data = BeautifulSoup(source, self._PARSERS[lang])

    def xquery(self, query):
        """Run an XQuery expression against the wrapped document using Saxon.

        The document is serialized and piped to a ``java`` subprocess on its
        standard input; the query result is read back from standard output.

        Returns:
            A list of XML objects, one per top-level node of the result.

        Raises:
            subprocess.CalledProcessError: the java process exited with a
                non-zero status (``check=True``).
        """
        # NOTE(review): the jar names are relative, so they are presumably
        # looked up from the current working directory; ':' is the POSIX
        # classpath separator (Java expects ';' on Windows) -- confirm the
        # intended deployment.
        completed = run(
            [
                "java",
                "-cp",
                "saxon.jar:tagsoup-1.2.jar",
                "net.sf.saxon.Query",
                "!omit-xml-declaration=yes",
                "-qs:" + query,
                "-s:-",
            ],
            stdout=PIPE,
            input=str(self.data).encode(),
            check=True,
        )
        # A query may yield several top-level items, which is not a
        # well-formed document on its own. Wrap the output in a synthetic
        # <xquery> root so it parses as one document and the items can be
        # read back as that root's children.
        wrapped = XML(str=b"<xquery>" + completed.stdout + b"</xquery>")
        return [XML(data=node) for node in wrapped.data.xquery.children]

    def __str__(self):
        """Return the wrapped node as indented markup when it supports that."""
        # xquery() wraps every child of the result, which can include text
        # nodes; those may not offer prettify(), so fall back to plain str().
        prettify = getattr(self.data, "prettify", None)
        if prettify is None:
            return str(self.data)
        return prettify()