# class_xml.py
  1. import urllib.request
  2. from bs4 import BeautifulSoup
  3. from subprocess import run, PIPE
  4. import dicttoxml
  5. import json
  6. class XML:
  7. def __init__(self, **kwargs):
  8. """Provide one of the following parameters
  9. str=<a>...</a>
  10. url=http:// or file://
  11. data=BeautifulSoup object
  12. Optional parameters : lang=xml,html,json
  13. """
  14. # fetch data
  15. if 'data' in kwargs:
  16. self.data = kwargs['data']
  17. else:
  18. if 'str' in kwargs:
  19. source = kwargs['str']
  20. elif 'url' in kwargs:
  21. source = urllib.request.urlopen(kwargs['url']).read()
  22. else:
  23. raise ValueError('XML : either str, url, or data must be provided')
  24. # parse data
  25. lang = kwargs.get('lang', 'xml')
  26. if lang == 'json':
  27. source = dicttoxml.dicttoxml(json.loads(source.decode()), attr_type=False, custom_root='json')
  28. parser = "lxml-xml"
  29. elif lang == "xml":
  30. parser = "lxml-xml"
  31. elif lang == "html":
  32. parser = "lxml"
  33. else:
  34. raise NotImplementedError('lang='+lang+" not supported")
  35. self.data = BeautifulSoup(source, parser)
  36. def xquery(self, query):
  37. """A binding with Saxon (at least for the moment)
  38. returns a list of XML objects
  39. """
  40. process = run(["java", "-cp", "saxon.jar:tagsoup-1.2.jar",
  41. "net.sf.saxon.Query", "!omit-xml-declaration=yes", "-qs:" + query, "-s:-"],
  42. stdout=PIPE, input=str(self.data).encode(), check=True)
  43. return [XML(data=obj) for obj in XML(str=b"<xquery>" + process.stdout + b"</xquery>").data.xquery.children]
  44. def __str__(self):
  45. return self.data.prettify()