This module contains XFind and CSS selectors and related classes and functions.
A selector is a XIST tree traversal filter that traverses the complete XML tree and outputs those nodes specified by the selector. Selectors can be combined with various operations and form a language comparable to XPath but implemented as Python expressions.
class Selector(ll.xist.xsc.WalkFilter):
Base class for all tree traversal filters that visit the complete tree.
Whether a node gets output can be specified by overriding the
matchpath method. Selectors can be combined with various operations
(see methods below).
def matchpath(self, *args, **kwargs):
selfdef filterpath(self, path):
selfdef __div__(self, other):
selfCreate a ChildCombinator with self as the left hand
selector and other as the right hand selector.
def __floordiv__(self, other):
selfCreate a DescendantCombinator with self as the left hand
selector and other as the right hand selector.
def __mul__(self, other):
selfCreate an AdjacentSiblingCombinator with self as the left
hand selector and other as the right hand selector.
def __pow__(self, other):
selfCreate a GeneralSiblingCombinator with self as the left
hand selector and other as the right hand selector.
def __and__(self, other):
selfCreate an AndCombinator from self and other.
def __or__(self, other):
selfCreate an OrCombinator from self and other.
def __invert__(self):
selfCreate a NotCombinator inverting self.
class IsInstanceSelector(Selector):
Selector that selects all nodes that are instances of the specified type.
You can either create an IsInstanceSelector object directly
or simply pass a class to a function that expects a walk filter (this class
will be automatically wrapped in an IsInstanceSelector):
>>> from ll.xist import parsers, xfind
>>> from ll.xist.ns import html
>>> doc = parsers.parseurl("http://www.python.org", tidy=True)
>>> for node in doc.walknode(html.a):
... print node.attrs.href, node.attrs.title
...
http://www.python.org/
http://www.python.org/#left%2Dhand%2Dnavigation
http://www.python.org/#content%2Dbody
http://www.python.org/search
http://www.python.org/about/ About The Python Language
http://www.python.org/news/ Major Happenings Within the Python Community
http://www.python.org/doc/ Tutorials, Library Reference, C API
http://www.python.org/download/ Start Running Python Under Windows, Mac, Linux and Others
...def __init__(self, *types):
selfdef matchpath(self, path):
selfdef __or__(self, other):
selfdef __getitem__(self, index):
selfReturn an nthoftype selector that uses index as the
index and self.types as the types.
def __str__(self):
selfclass hasname(Selector):
Selector that selects all nodes that have a specified Python name (which only selects elements, processing instructions and entities). Also a namespace name can be specified as a second argument, which will only select elements from the specified namespace:
>>> from ll.xist import parsers, xfind
>>> doc = parsers.parseurl("http://www.python.org", tidy=True)
>>> for node in doc.walknode(xfind.hasname("img")):
... print node.bytes()
...
<img border="0" src="http://www.python.org/images/python-logo.gif" alt="homepage" id="logo" />
<img border="0" id="skiptonav" alt="skip to navigation" src="http://www.python.org/images/trans.gif" />
<img border="0" id="skiptocontent" alt="skip to content" src="http://www.python.org/images/trans.gif" />
<img alt="success story photo" class="success" src="http://www.python.org/images/success/nasa.jpg" />def __init__(self, name, xmlns=None):
selfdef matchpath(self, path):
selfdef __str__(self):
selfclass hasname_xml(Selector):
hasname_xml works similar to hasname except that the
specified name is treated as the XML name, not the Python name.
def __init__(self, name, xmlns=None):
selfdef matchpath(self, path):
selfdef __str__(self):
selfclass IsSelector(Selector):
Selector that selects one specific node in the tree. This can be combined
with other selectors via ChildCombinator or
DescendantCombinator selectors to select children of this specific
node. You can either create an IsSelector directly or simply pass
a node to a function that expects a walk filter:
>>> from ll.xist import parsers, xfind
>>> doc = parsers.parseurl("http://www.python.org", tidy=True)
>>> for node in doc.walknode(doc[0]/xsc.Element):
... print repr(node)
...
<ll.xist.ns.html.head element object (13 children/no attrs) (from http://www.python.org/:6:?) at 0xb6c82f4c>
<ll.xist.ns.html.body element object (19 children/no attrs) (from http://www.python.org/:26:?) at 0xb6c3154c>def __init__(self, node):
selfdef matchpath(self, path):
selfdef __str__(self):
selfclass hasattr(Selector):
Selector that selects all element nodes that have an attribute with one of the specified Python names. For selecting nodes with global attributes the attribute class can be passed:
>>> from ll.xist import parsers, xfind
>>> from ll.xist.ns import html, xml
>>> doc = parsers.parseurl("http://www.python.org", tidy=True)
>>> for node in doc.walknode(xfind.hasattr(xml.Attrs.lang)):
... print repr(node)
...
<ll.xist.ns.html.html element object (2 children/2 attrs) (from http://www.python.org/:4:?) at 0xb6d71d4c>def __init__(self, *attrnames):
selfdef matchpath(self, path):
selfdef __str__(self):
selfclass hasattr_xml(Selector):
hasattr_xml works similar to hasattr except that the
specified names are treated as XML names instead of Python names.
def __init__(self, *attrnames):
selfdef matchpath(self, path):
selfdef __str__(self):
selfclass attrhasvalue(Selector):
Selector that selects all element nodes where an attribute with the specified Python name has one of the specified values. For global attributes the attribute class can be passed. Note that "fancy" attributes (i.e. those containing non-text) will not be considered:
>>> from ll.xist import parsers, xfind
>>> doc = parsers.parseurl("http://www.python.org", tidy=True)
>>> for node in doc.walknode(xfind.attrhasvalue("rel", "stylesheet")):
... print node.attrs.href
...
http://www.python.org/styles/screen-switcher-default.css
http://www.python.org/styles/netscape4.css
http://www.python.org/styles/print.cssdef __init__(self, attrname, *attrvalues):
selfdef matchpath(self, path):
selfdef __str__(self):
selfclass attrhasvalue_xml(Selector):
attrhasvalue_xml works similar to attrhasvalue except that
the specified name is treated as an XML name instead of a Python name.
def __init__(self, attrname, *attrvalues):
selfdef matchpath(self, path):
selfdef __str__(self):
selfclass attrcontains(Selector):
Selector that selects all element nodes where an attribute with the specified Python name contains one of the specified substrings in its value. For global attributes the attribute class can be passed. Note that "fancy" attributes (i.e. those containing non-text) will not be considered:
>>> from ll.xist import parsers, xfind
>>> doc = parsers.parseurl("http://www.python.org", tidy=True)
>>> for node in doc.walknode(xfind.attrcontains("rel", "stylesheet")):
... print node.attrs.rel, node.attrs.href
...
stylesheet http://www.python.org/styles/screen-switcher-default.css
stylesheet http://www.python.org/styles/netscape4.css
stylesheet http://www.python.org/styles/print.css
alternate stylesheet http://www.python.org/styles/largestyles.css
alternate stylesheet http://www.python.org/styles/defaultfonts.cssdef __init__(self, attrname, *attrvalues):
selfdef matchpath(self, path):
selfdef __str__(self):
selfclass attrcontains_xml(Selector):
attrcontains_xml works similar to attrcontains except that
the specified name is treated as an XML name instead of a Python name.
def __init__(self, attrname, *attrvalues):
selfdef matchpath(self, path):
selfdef __str__(self):
selfclass attrstartswith(Selector):
Selector that selects all element nodes where an attribute with the specified Python name starts with any of the specified strings. For global attributes the attribute class can be passed. Note that "fancy" attributes (i.e. those containing non-text) will not be considered:
>>> from ll.xist import parsers, xfind
>>> doc = parsers.parseurl("http://www.python.org", tidy=True)
>>> for node in doc.walknode(xfind.attrstartswith("class_", "input-")):
... print repr(node)
...
<input class="input-text" id="q" type="text" name="q" />
<input value="search" class="input-button" id="submit" type="submit" name="submit" />def __init__(self, attrname, *attrvalues):
selfdef matchpath(self, path):
selfdef __str__(self):
selfclass attrstartswith_xml(Selector):
attrstartswith_xml works similar to attrstartswith except
that the specified name is treated as an XML name instead of a Python name.
def __init__(self, attrname, *attrvalues):
selfdef matchpath(self, path):
selfdef __str__(self):
selfclass attrendswith(Selector):
Selector that selects all element nodes where an attribute with the specified Python name ends with one of the specified strings. For global attributes the attribute class can be passed. Note that "fancy" attributes (i.e. those containing non-text) will not be considered:
>>> from ll.xist import parsers, xfind
>>> doc = parsers.parseurl("http://www.python.org", tidy=True)
>>> for node in doc.walknode(xfind.attrendswith("href", ".css")):
... print node.attrs.href
...
http://www.python.org/styles/screen-switcher-default.css
http://www.python.org/styles/netscape4.css
http://www.python.org/styles/print.css
http://www.python.org/styles/largestyles.css
http://www.python.org/styles/defaultfonts.cssdef __init__(self, attrname, *attrvalues):
selfdef matchpath(self, path):
selfdef __str__(self):
selfclass attrendswith_xml(Selector):
attrendswith_xml works similar to attrendswith except that
the specified name is treated as an XML name instead of a Python name.
def __init__(self, attrname, *attrvalues):
selfdef matchpath(self, path):
selfdef __str__(self):
selfclass hasid(Selector):
Selector that selects all element nodes where the id attribute has one
of the specified values:
>>> from ll.xist import parsers, xfind
>>> doc = parsers.parseurl("http://www.python.org", tidy=True)
>>> for node in doc.walknode(xfind.hasid("logo")):
... print node.bytes()
...
<img src="http://www.python.org/images/python-logo.gif" id="logo" alt="homepage" border="0" />def __init__(self, *ids):
selfdef matchpath(self, path):
selfdef __str__(self):
selfclass hasclass(Selector):
Selector that selects all element nodes where the class attribute contains
one of the specified values:
>>> from ll.xist import parsers, xfind
>>> doc = parsers.parseurl("http://www.python.org", tidy=True)
>>> for node in doc.walknode(xfind.hasclass("reference")):
... print node.bytes()
...
<a class="reference" href="http://www.python.org/search">Advanced Search</a>
<a href="http://www.python.org/about/success/rackspace" class="reference">Rackspace</a>
<a href="http://www.python.org/about/success/ilm" class="reference">Industrial Light and Magic</a>
<a href="http://www.python.org/about/success/astra" class="reference">AstraZeneca</a>
...def __init__(self, *classnames):
selfdef matchpath(self, path):
selfdef __str__(self):
selfclass Combinator(Selector):
A Combinator is a selector that transforms one or combines two or
more other selectors in a certain way.
class BinaryCombinator(Combinator):
A BinaryCombinator is a combinator that combines two selectors:
the left hand selector and the right hand selector.
def __init__(self, left, right):
selfdef __str__(self):
selfclass ChildCombinator(BinaryCombinator):
A ChildCombinator is a BinaryCombinator. To match the
ChildCombinator the node must match the right hand selector and
its immediate parent must match the left hand selector (i.e. it works
similar to the > combinator in CSS or the / combinator in XPath).
ChildCombinator objects can be created via the division operator
(/):
>>> from ll.xist import parsers, xfind
>>> from ll.xist.ns import html
>>> doc = parsers.parseurl("http://www.python.org", tidy=True)
>>> for node in doc.walknode(html.a/html.img):
... print node.bytes()
...
<img src="http://www.python.org/images/python-logo.gif" alt="homepage" id="logo" border="0" />
<img id="skiptonav" alt="skip to navigation" src="http://www.python.org/images/trans.gif" border="0" />
<img id="skiptocontent" alt="skip to content" src="http://www.python.org/images/trans.gif" border="0" />
<img alt="success story photo" class="success" src="http://www.python.org/images/success/nasa.jpg" />def matchpath(self, path):
selfclass DescendantCombinator(BinaryCombinator):
A DescendantCombinator is a BinaryCombinator. To match the
DescendantCombinator the node must match the right hand selector
and any of its ancestor nodes must match the left hand selector (i.e. it
works similar to the descendant combinator in CSS or the // combinator
in XPath).
DescendantCombinator objects can be created via the floor division
operator (//):
>>> from ll.xist import parsers, xfind
>>> from ll.xist.ns import html
>>> doc = parsers.parseurl("http://www.python.org", tidy=True)
>>> for node in doc.walknode(html.div//html.img):
... print node.bytes()
...
<img id="skiptonav" alt="skip to navigation" src="http://www.python.org/images/trans.gif" border="0" />
<img id="skiptocontent" alt="skip to content" src="http://www.python.org/images/trans.gif" border="0" />
<img alt="success story photo" class="success" src="http://www.python.org/images/success/nasa.jpg" />def matchpath(self, path):
selfclass AdjacentSiblingCombinator(BinaryCombinator):
An AdjacentSiblingCombinator is a BinaryCombinator.
To match the AdjacentSiblingCombinator the node must match the
right hand selector and the immediately preceding sibling must match the
left hand selector.
AdjacentSiblingCombinator objects can be created via the
multiplication operator (*). The following example outputs all links
inside those p elements that immediately follow an h2
element:
>>> from ll.xist import parsers, xfind
>>> from ll.xist.ns import html
>>> doc = parsers.parseurl("http://www.python.org", tidy=True)
>>> for node in doc.walknode(html.h2*html.p/html.a):
... print node.bytes()
...
<a href="http://www.scipy.org/SciPy2007" class="reference">SciPy Conference</a>
<a href="https://www.enthought.com/scipy07/" class="reference">early registration</a>
<a href="http://www.europython.org/sections/registration_issues/how-to-register" class="reference">Online registration</a>
<a href="http://europython.org/" class="reference">EuroPython 2007</a>
<a href="http://www.osdc.com.au/papers/cfp.html" class="reference">Call For Papers</a>
<a href="http://www.swa.hpi.uni-potsdam.de/dls07/" class="reference">DLS 2007</a>
<a href="http://pythonpapers.cgpublisher.com/" class="reference">The Python Papers</a>
<a href="http://www.pyconuk.org/" class="reference">PyCon UK</a>
<a href="http://www.pyconuk.org/submit.html" class="reference">proposals for talks</a>
<a href="http://www.pycon.it/registration/" class="reference">registration online</a>def matchpath(self, path):
selfclass GeneralSiblingCombinator(BinaryCombinator):
A GeneralSiblingCombinator is a BinaryCombinator.
To match the GeneralSiblingCombinator the node must match the
right hand selector and any of the preceding siblings must match the left
hand selector.
GeneralSiblingCombinator objects can be created via the
exponentiation operator (**). The following example outputs all links
that are not the first links inside their parent (i.e. they have another
link among their preceding siblings):
>>> from ll.xist import parsers, xfind
>>> from ll.xist.ns import html
>>> doc = parsers.parseurl("http://www.python.org", tidy=True)
>>> for node in doc.walknode(html.a**html.a):
... print node.bytes()
...
<a href="http://www.python.org/about/success/ilm" class="reference">Industrial Light and Magic</a>
<a href="http://www.python.org/about/success/astra" class="reference">AstraZeneca</a>
<a href="http://www.python.org/about/success/honeywell" class="reference">Honeywell</a>
<a href="http://www.python.org/about/success" class="reference">and many others</a>
<a href="http://www.zope.org/">Zope</a>
...def matchpath(self, path):
selfclass ChainedCombinator(Combinator):
A ChainedCombinator combines any number of other selectors.
def __init__(self, *selectors):
selfdef __str__(self):
selfclass OrCombinator(ChainedCombinator):
An OrCombinator is a ChainedCombinator where the node must
match at least one of the selectors to match the OrCombinator. An
OrCombinator can be created with the binary or operator (|):
>>> from ll.xist import parsers, xfind
>>> from ll.xist.ns import html
>>> doc = parsers.parseurl("http://www.python.org", tidy=True)
>>> for node in doc.walknode(xfind.hasattr("href") | xfind.hasattr("src")):
... print node.attrs.href if "href" in node.Attrs else node.attrs.src
...
http://www.python.org/channews.rdf
http://aspn.activestate.com/ASPN/Cookbook/Python/index_rss
http://python-groups.blogspot.com/feeds/posts/default
http://www.showmedo.com/latestVideoFeed/rss2.0?tag=python
http://www.awaretek.com/python/index.xml
http://pyfound.blogspot.com/feeds/posts/default
http://www.python.org/dev/peps/peps.rss
http://www.python.org/community/jobs/jobs.rss
http://www.reddit.com/r/Python/.rss
http://www.python.org/styles/screen-switcher-default.css
http://www.python.org/styles/netscape4.css
http://www.python.org/styles/print.css
http://www.python.org/styles/largestyles.css
http://www.python.org/styles/defaultfonts.css
...def matchpath(self, path):
selfdef __or__(self, other):
selfclass AndCombinator(ChainedCombinator):
An AndCombinator is a ChainedCombinator where the node
must match all of the combined selectors to match the AndCombinator.
An AndCombinator can be created with the binary and operator
(&):
>>> from ll.xist import parsers, xfind
>>> from ll.xist.ns import html
>>> doc = parsers.parseurl("http://www.python.org", tidy=True)
>>> for node in doc.walknode(html.input & xfind.hasattr("id")):
... print node.bytes()
...
<input id="domains" name="domains" value="www.python.org" type="hidden" />
<input id="sitesearch" name="sitesearch" value="www.python.org" type="hidden" />
<input id="sourceid" name="sourceid" value="google-search" type="hidden" />
<input id="q" class="input-text" name="q" type="text" />
<input id="submit" value="search" name="submit" type="submit" class="input-button" />def matchpath(self, path):
selfdef __and__(self, other):
selfclass NotCombinator(Combinator):
A NotCombinator inverts the selection logic of the underlying
selector, i.e. a node matches only if it does not match the underlying
selector. A NotCombinator can be created with the unary inversion
operator (~).
The following example outputs all images that don't have a border
attribute:
>>> from ll.xist import parsers, xfind
>>> from ll.xist.ns import html
>>> doc = parsers.parseurl("http://www.python.org", tidy=True)
>>> for node in doc.walknode(html.img & ~xfind.hasattr("border")):
... print node.bytes()
...
<img alt="success story photo" class="success" src="http://www.python.org/images/success/nasa.jpg" />def __init__(self, selector):
selfdef matchpath(self, path):
selfdef __str__(self):
selfclass CallableSelector(Selector):
A CallableSelector is a selector that calls a user specified
callable to select nodes. The callable gets passed the path and must return
a bool specifying whether this path is selected. A CallableSelector
is created implicitly whenever a callable is passed to a method that
expects a walk filter.
The following example outputs all links that point outside the python.org
domain:
>>> from ll.xist import parsers, xfind
>>> from ll.xist.ns import html
>>> doc = parsers.parseurl("http://www.python.org", tidy=True)
>>> def foreignlink(path):
... return path and isinstance(path[-1], html.a) and not path[-1].attrs.href.asURL().server.endswith(".python.org")
...
>>> for node in doc.walknode(foreignlink):
... print node.bytes()
...
<a href="http://youtube.com/" class="reference">YouTube.com</a>
<a href="http://www.zope.org/">Zope</a>
<a href="http://www.djangoproject.com/">Django</a>
<a href="http://www.turbogears.org/">TurboGears</a>
<a href="http://pyxml.sourceforge.net/topics/">XML</a>
..def __init__(self, func):
selfdef matchpath(self, path):
selfdef __str__(self):
selfclass nthchild(Selector):
An nthchild object is a selector that selects every node that is
the n-th child of its parent. E.g. nthchild(0) selects every first
child, nthchild(-1) selects each last child. Furthermore
nthchild("even") selects each first, third, fifth, ... child and
nthchild("odd") selects each second, fourth, sixth, ... child.
def __init__(self, index):
selfdef matchpath(self, path):
selfdef __str__(self):
selfclass nthoftype(Selector):
An nthoftype object is a selector that selects every node that is
the n-th node of a specified type among its siblings. Similar to
nthchild, nthoftype supports negative and positive indices
as well as "even" and "odd". Which types are checked can be passed
explicitly. If no types are passed the type of the node itself is used:
>>> from ll.xist import parsers, xfind
>>> from ll.xist.ns import html
>>> doc = parsers.parseurl("http://www.python.org", tidy=True)
>>> for node in doc.walknode(xfind.nthoftype(0, html.h2)):
... print node.bytes()
...
<h2 class="news">SciPy 2007 - Conference for Scientific Computing</h2>def __init__(self, index, *types):
selfdef _find(self, path):
selfdef matchpath(self, path):
selfdef __str__(self):
self