Module bs_processors.xml_util
Utilities for working with BeautifulSoup trees
This module contains utilities useful for implementing custom processors and predicates.
Expand source code
"""
Utilities for working with BeautifulSoup trees
This module contains utilities useful for implementing custom processors and predicates.
"""
from typing import Any, Callable, List
from bs4 import BeautifulSoup
def set_new_children(elm, children):
"""
Sets new children to an element
* **elm**: the element to have its children set
* **children**: a sequence of children
* **return**: the element
>>> xml = BeautifulSoup("<root>X<a/>X<b/>X</root>",'xml')
>>> xml
<?xml version="1.0" encoding="utf-8"?>
<root>X<a/>X<b/>X</root>
>>> new_children = [xml.new_tag("x"), xml.new_tag("y")]
>>> set_new_children( xml.root, new_children)
<root><x/><y/></root>
"""
new_children = list(children)
if list(elm.children) == new_children:
return elm
elm.clear()
for child in new_children:
elm.append(child)
return elm
def process_children(processor: Callable[[Any], List[Any]], elm):
"""
Processes the children of an element
* **processor**: A processor that takes an element and returns a list of elements
* **elm**: the element to process
* **return**: A list with the joined results of processing each child with the provided processor
>>> doc = BeautifulSoup("<div>a<span>b</span>c<p>d<span>e</span>f<a/>g</p>h</div>", "html.parser")
>>> counter = [0]
>>> def processor(elm):
... counter[0] +=1
... elm['id'] = str(counter[0])
... return [elm]
>>> process_children(processor, doc.div)
['a', <span id="1">b</span>, 'c', <p id="2">d<span>e</span>f<a></a>g</p>, 'h']
"""
result = []
for child in elm.children:
if is_tag(child):
result += processor(child)
else:
result.append(child)
return result
def is_tag(elm):
"""
True if the passed object is a HTML/XML tag
* **elm**:
* **return**:
>>> from bs4 import BeautifulSoup as bs
>>> doc = bs("<span></span><p>bubu</p>", "html.parser")
>>> span = doc.span
>>> is_tag(doc.span)
True
>>> is_tag(doc.p)
True
>>> is_tag(list(doc.p.children)[0])
False
"""
return elm is not None and elm.name is not None
_bs = BeautifulSoup("<a/>", "html.parser")
def copy_element_type(elm):
"""
Creates a new element with the same tag name as the passed element
>>> xml = BeautifulSoup("<root><abc/></root>", "xml")
>>> copy_element_type(xml.root.abc)
<abc></abc>
"""
return _bs.new_tag(elm.name)
Functions
def copy_element_type(elm)
-
Creates a new element with the same tag name as the passed element
>>> xml = BeautifulSoup("<root><abc/></root>", "xml") >>> copy_element_type(xml.root.abc) <abc></abc>
Expand source code
def copy_element_type(elm): """ Creates a new element with the same tag name as the passed element >>> xml = BeautifulSoup("<root><abc/></root>", "xml") >>> copy_element_type(xml.root.abc) <abc></abc> """ return _bs.new_tag(elm.name)
def is_tag(elm)
-
True if the passed object is a HTML/XML tag
- elm:
- return:
>>> from bs4 import BeautifulSoup as bs >>> doc = bs("<span></span><p>bubu</p>", "html.parser") >>> span = doc.span >>> is_tag(doc.span) True >>> is_tag(doc.p) True >>> is_tag(list(doc.p.children)[0]) False
Expand source code
def is_tag(elm): """ True if the passed object is a HTML/XML tag * **elm**: * **return**: >>> from bs4 import BeautifulSoup as bs >>> doc = bs("<span></span><p>bubu</p>", "html.parser") >>> span = doc.span >>> is_tag(doc.span) True >>> is_tag(doc.p) True >>> is_tag(list(doc.p.children)[0]) False """ return elm is not None and elm.name is not None
def process_children(processor: Callable[[Any], List[Any]], elm)
-
Processes the children of an element
- processor: A processor that takes an element and returns a list of elements
- elm: the element to process
- return: A list with the joined results of processing each child with the provided processor
>>> doc = BeautifulSoup("<div>a<span>b</span>c<p>d<span>e</span>f<a/>g</p>h</div>", "html.parser") >>> counter = [0] >>> def processor(elm): ... counter[0] +=1 ... elm['id'] = str(counter[0]) ... return [elm] >>> process_children(processor, doc.div) ['a', <span id="1">b</span>, 'c', <p id="2">d<span>e</span>f<a></a>g</p>, 'h']
Expand source code
def process_children(processor: Callable[[Any], List[Any]], elm): """ Processes the children of an element * **processor**: A processor that takes an element and returns a list of elements * **elm**: the element to process * **return**: A list with the joined results of processing each child with the provided processor >>> doc = BeautifulSoup("<div>a<span>b</span>c<p>d<span>e</span>f<a/>g</p>h</div>", "html.parser") >>> counter = [0] >>> def processor(elm): ... counter[0] +=1 ... elm['id'] = str(counter[0]) ... return [elm] >>> process_children(processor, doc.div) ['a', <span id="1">b</span>, 'c', <p id="2">d<span>e</span>f<a></a>g</p>, 'h'] """ result = [] for child in elm.children: if is_tag(child): result += processor(child) else: result.append(child) return result
def set_new_children(elm, children)
-
Sets new children to an element
- elm: the element to have its children set
- children: a sequence of children
- return: the element
>>> xml = BeautifulSoup("<root>X<a/>X<b/>X</root>",'xml') >>> xml <?xml version="1.0" encoding="utf-8"?> <root>X<a/>X<b/>X</root> >>> new_children = [xml.new_tag("x"), xml.new_tag("y")] >>> set_new_children( xml.root, new_children) <root><x/><y/></root>
Expand source code
def set_new_children(elm, children): """ Sets new children to an element * **elm**: the element to have its children set * **children**: a sequence of children * **return**: the element >>> xml = BeautifulSoup("<root>X<a/>X<b/>X</root>",'xml') >>> xml <?xml version="1.0" encoding="utf-8"?> <root>X<a/>X<b/>X</root> >>> new_children = [xml.new_tag("x"), xml.new_tag("y")] >>> set_new_children( xml.root, new_children) <root><x/><y/></root> """ new_children = list(children) if list(elm.children) == new_children: return elm elm.clear() for child in new_children: elm.append(child) return elm