html_doc = """<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

import re
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'html.parser')
# prettify() 方法将Beautiful Soup的文档树格式化后以Unicode编码输出
# 每个XML / HTML标签都独占一行
# BeautifulSoup对象和它的任意Tag节点都可以调用prettify()方法
print(soup.prettify())
# 如果不想用UTF-8编码输出，可以将编码方式作为参数传入prettify()方法
# print(soup.prettify("latin-1"))

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>

# 如果只想得到结果字符串，不在意格式，
# 那么可以对一个BeautifulSoup对象或Tag对象使用Python的str()方法
str(soup)

'<html><head><title>The Dormouse\'s story</title></head>\n<body>\n<p class="title"><b>The Dormouse\'s story</b></p>\n<p class="story">Once upon a time there were three little sisters; and their names were\n<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,\n<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and\n<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;\nand they lived at the bottom of a well.</p>\n<p class="story">...</p>\n</body></html>'

# 如果只想得到标签中包含的文本内容，那么可以调用get_text()方法
# 这个方法获取到标签中包含的所有文本内容，包括子孙标签中的内容
soup.get_text()

"The Dormouse's story\n\nThe Dormouse's story\nOnce upon a time there were three little sisters; and their names were\nElsie,\nLacie and\nTillie;\nand they lived at the bottom of a well.\n...\n"

print(soup.get_text())

The Dormouse's story

The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...

# 可以通过参数指定标签的文本内容的分隔符
soup.get_text("|")

"The Dormouse's story|\n|\n|The Dormouse's story|\n|Once upon a time there were three little sisters; and their names were\n|Elsie|,\n|Lacie| and\n|Tillie|;\nand they lived at the bottom of a well.|\n|...|\n"

# 还可以去除获得文本内容的前后空白
soup.get_text("|", strip=True)

"The Dormouse's story|The Dormouse's story|Once upon a time there were three little sisters; and their names were|Elsie|,|Lacie|and|Tillie|;\nand they lived at the bottom of a well.|..."

# 或者使用.stripped_strings生成器，获得文本列表后手动处理列表
# list() materializes the generator directly; a pass-through comprehension adds nothing.
list(soup.stripped_strings)

["The Dormouse's story",
 "The Dormouse's story",
 'Once upon a time there were three little sisters; and their names were',
 'Elsie',
 ',',
 'Lacie',
 'and',
 'Tillie',
 ';\nand they lived at the bottom of a well.',
 '...']

# BeautifulSoup对象表示的是一个文档的全部内容。
# 大部分时候，可以把它当作Tag对象，
# 它支持遍历文档树和搜索文档树中描述的大部分的方法。
# 因为BeautifulSoup对象并非真正的HTML或XML的标签，所以它没有名称和属性。
# 但有时查看它的.name属性是很方便的，
# 所以BeautifulSoup对象包含了一个值为“[document]”的特殊属性.name
soup.name

'[document]'

# 通过点取属性的方式只能获得当前名字的第一个标签
# 如果想要得到所有的<a>标签，就需要用到搜索树中描述的方法，例如：find_all（）
soup.a

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

soup.a.attrs

{'href': 'http://example.com/elsie', 'class': ['sister'], 'id': 'link1'}

soup.a['href']

'http://example.com/elsie'

soup.a['class']

['sister']

soup.a['id']

'link1'

soup.p

<p class="title"><b>The Dormouse's story</b></p>

type(soup.p)

bs4.element.Tag

soup.p.name

'p'

type(soup.p.name)

str

# .parent属性来获取某个元素的父节点
soup.p.parent

<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body>

soup.p.parent.name

'body'

# .parents属性可以递归得到元素的所有父辈节点
# Walk up from the first <p> and print the name of each ancestor.
# `.parents` simply stops when there are no more ancestors and never yields
# None, so no None check is needed before reading `.name`.
for parent in soup.p.parents:
    print(parent.name)

body
html
[document]

soup.p.b

<b>The Dormouse's story</b>

soup.p.b.text

"The Dormouse's story"

type(soup.p.b.text)

str

# Beautiful Soup用NavigableString类来包装标签中的字符串。
# NavigableString对象支持遍历文档树和搜索文档树中定义的大部分属性，而不是全部。
# 尤其是，一个字符串不能包含其他内容（标签则可以包含字符串以及其他标签），
# 所以字符串不支持.contents或.string属性，也不支持find()方法。
type(soup.p.b.string)

bs4.element.NavigableString

# 如果标签只有一个NavigableString类型的子节点，那么可以通过.string得到这个子节点
soup.p.b.string

"The Dormouse's story"

# 如果tag包含了多个子节点，tag就无法确定.string应该取哪个子节点的内容，此时.string的结果是None
print(soup.body.string)

None

# 如果tag中包含多个字符串，可以使用.strings来循环获取
# repr() 函数采用单个参数,返回给定对象的可打印表示形式,语法repr(obj)
# 将不可见的换行符输出为'\n'
for string in soup.body.strings:
    print(repr(string))

'\n'
"The Dormouse's story"
'\n'
'Once upon a time there were three little sisters; and their names were\n'
'Elsie'
',\n'
'Lacie'
' and\n'
'Tillie'
';\nand they lived at the bottom of a well.'
'\n'
'...'
'\n'

# 使用.stripped_strings可以删除多余的空白内容
# 全部是空格的行会被忽略掉，段首和段末的空白会被删除
for string in soup.body.stripped_strings:
    print(repr(string))

"The Dormouse's story"
'Once upon a time there were three little sisters; and their names were'
'Elsie'
','
'Lacie'
'and'
'Tillie'
';\nand they lived at the bottom of a well.'
'...'

# Beautiful Soup定义了很多搜索方法，这里着重介绍2个：find()和find_all()
# 唯一的区别是find_all()方法返回的是包含所有匹配结果的列表（即使只有一个匹配），而find()方法直接返回第一个匹配结果。
# find_all()方法没有找到目标时返回空列表，find()方法没有找到目标时返回None。
# find(name, attrs, recursive, string, **kwargs)
soup.find(id="link3")

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

# 要用print函数才能打印出来None
print(soup.find("nosuchtag"))

None

# soup.head.title是标签的名字方法的简写。
# 这个简写的原理就是多次调用当前标签的find()方法
soup.head.title

<title>The Dormouse's story</title>

soup.find("head").find("title")

<title>The Dormouse's story</title>

# find_all(name, attrs, recursive, string, limit, **kwargs)
# find_all() 方法搜索当前标签的所有子孙节点，并判断是否符合过滤器的条件
# name参数可以查找所有名字为name的标签，字符串对象会被自动忽略掉。用法如下
soup.find_all("title")

[<title>The Dormouse's story</title>]

# BeautifulSoup对象和tag对象可以被当作一个方法来使用，
# 这个方法的执行结果与调用这个对象的find_all()方法相同
soup("title")

[<title>The Dormouse's story</title>]

soup.title.find_all(string=True)

["The Dormouse's story"]

soup.title(string=True)

["The Dormouse's story"]

soup.find_all("p", "title")

[<p class="title"><b>The Dormouse's story</b></p>]

# 关键字参数
# 如果一个指定名字的参数不是搜索方法内置的参数名，搜索时会把该参数当作标签的属性来搜索。例如传入名字为id的参数，
# Beautiful Soup会搜索每个标签的“id”属性
soup.find_all(id="link2")

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

# 搜索指定名称的属性时可以使用的参数值包括字符串，正则表达式，列表，True
# 下面的例子在文档树中查找所有包含id属性的标签，无论id的值是什么
soup.find_all(id=True)

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

# 如果传入href参数，Beautiful Soup会搜索每个标签的“href”属性
soup.find_all(href=re.compile("elsie"))

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

# 使用多个指定名字的参数可以同时过滤tag的多个属性
soup.find_all(href=re.compile("elsie"), id='link1')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

# 有些标签属性不能用作关键字参数来搜索，例如HTML5中的data-*属性
# data_soup.find_all(data-foo="value") 会报错
# 但是可以通过find_all()方法的attrs参数定义一个字典参数来搜索包含特殊属性的标签
# Name the parser explicitly, as the main `soup` does: otherwise bs4 picks whichever
# parser is installed, emits GuessedAtParserWarning, and may build a different tree elsewhere.
data_soup = BeautifulSoup('<div data-foo="value">foo!</div>', 'html.parser')

data_soup.find_all(attrs={"data-foo": "value"})

[<div data-foo="value">foo!</div>]

# CSS类名的关键字class在Python中是保留字
# 可以通过class_参数搜索有指定CSS类名的标签
soup.find_all('a', class_="sister")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

# class_参数同样接受不同类型的过滤器：字符串，正则表达式，方法或True
soup.find_all(class_=re.compile("itl"))

[<p class="title"><b>The Dormouse's story</b></p>]

def has_six_characters(css_class):
    """Tell whether a CSS class value is exactly six characters long.

    Intended as a ``class_`` filter for ``find_all``. A value of ``None``
    never matches; any other value matches only when its length is 6.
    """
    if css_class is None:
        return False
    return len(css_class) == 6

soup.find_all(class_=has_six_characters)

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

# 也可以通过attrs参数传入字典来按class搜索，效果与class_参数相同
# 注意：用包含多个类名的完整字符串匹配class时，如果类名的顺序与文档中不一致，将搜索不到结果
soup.find_all("a", attrs={"class": "sister"})

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

# 标签的class属性是多值属性。按照CSS类名搜索标签时，可以分别搜索标签中的每个CSS类名
# Name the parser explicitly, as the main `soup` does: otherwise bs4 picks whichever
# parser is installed, emits GuessedAtParserWarning, and may build a different tree elsewhere.
css_soup = BeautifulSoup('<p class="body strikeout"></p>', 'html.parser')
css_soup.find_all("p", class_="strikeout")

[<p class="body strikeout"></p>]

css_soup.find_all("p", class_="body")

[<p class="body strikeout"></p>]

# 搜索class属性时也可以通过CSS值完全匹配
css_soup.find_all("p", class_="body strikeout")

[<p class="body strikeout"></p>]

# 通过string参数可以搜索文档中的字符串内容
# 与name参数的可选值一样，string参数接受字符串，正则表达式，列表，True
soup.find_all(string="Elsie")

['Elsie']

soup.find_all(string=["Tillie", "Elsie", "Lacie"])

['Elsie', 'Lacie', 'Tillie']

soup.find_all(string=re.compile("Dormouse"))

["The Dormouse's story", "The Dormouse's story"]

soup.find_all("a", string="Elsie")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

# 使用limit参数限制返回结果的数量
soup.find_all("a", limit=2)

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

# tag标签具有很多属性和方法，可以像操作字典一样访问标签的属性，也可以直接通过.attrs访问该字典
# Walk every <a> tag and print the different ways of reaching its name, text and attributes.
for anchor in soup.find_all('a'):
    # The tag object itself, then its tag name.
    print(anchor, type(anchor))
    print(anchor.name, type(anchor.name))
    # Text content: via .text, then via .string (also shown converted with str()).
    print(anchor.text, type(anchor.text))
    only_string = anchor.string
    print(only_string, type(only_string), type(str(only_string)))
    # Attributes: the whole dict first, then item access next to .get() per attribute.
    print(anchor.attrs)
    for attr_name in ('class', 'href'):
        print(anchor[attr_name], anchor.get(attr_name))
    print(anchor['id'], anchor.get('id'), anchor.get_attribute_list('id'))
    # Visual separator between tags.
    print('*' * 100)

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a> <class 'bs4.element.Tag'>
a <class 'str'>
Elsie <class 'str'>
Elsie <class 'bs4.element.NavigableString'> <class 'str'>
{'href': 'http://example.com/elsie', 'class': ['sister'], 'id': 'link1'}
['sister'] ['sister']
http://example.com/elsie http://example.com/elsie
link1 link1 ['link1']
****************************************************************************************************
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> <class 'bs4.element.Tag'>
a <class 'str'>
Lacie <class 'str'>
Lacie <class 'bs4.element.NavigableString'> <class 'str'>
{'href': 'http://example.com/lacie', 'class': ['sister'], 'id': 'link2'}
['sister'] ['sister']
http://example.com/lacie http://example.com/lacie
link2 link2 ['link2']
****************************************************************************************************
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a> <class 'bs4.element.Tag'>
a <class 'str'>
Tillie <class 'str'>
Tillie <class 'bs4.element.NavigableString'> <class 'str'>
{'href': 'http://example.com/tillie', 'class': ['sister'], 'id': 'link3'}
['sister'] ['sister']
http://example.com/tillie http://example.com/tillie
link3 link3 ['link3']
****************************************************************************************************

# 如果正则表达式作为过滤器参数，Beautiful Soup会通过正则表达式的search()来匹配内容
# 找到所有以b开头的标签
for tag in soup.find_all(re.compile("^b")):
    print(tag.name)

body
b

# 找到所有名字中包含“ t”的标签
for tag in soup.find_all(re.compile("t")):
    print(tag.name)

html
title

# 如果列表作为过滤器参数，Beautiful Soup会将与列表中任一元素匹配的内容返回
#下面的代码找到文档中所有<a>标签和<b>标签
soup.find_all(["a", "b"])

[<b>The Dormouse's story</b>,
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

# True 可以匹配任何值，下面的代码查找到所有的标签，但是不会返回字符串节点
for tag in soup.find_all(True):
    print(tag.name)

html
head
title
body
p
b
p
a
a
a
p

# 如果没有合适的过滤器，那么还可以定义一个函数，这个函数只接受一个元素参数
# 如果这个函数返回True表示当前元素匹配并被找到，否则返回False
# 下面的函数检查当前元素，如果包含class属性却不包含id属性，那么将返回True
# 注意<a>标签都带有id属性，所以不会被单独返回；结果中出现的<a>标签只是<p>标签的内容
def has_class_but_no_id(tag):
    """Match a tag that carries a ``class`` attribute but no ``id`` attribute.

    Intended as a function filter for ``find_all``; ``tag`` must provide
    ``has_attr``. The ``id`` lookup only happens when ``class`` is present.
    """
    if tag.has_attr('class'):
        return not tag.has_attr('id')
    return False
soup.find_all(has_class_but_no_id)

[<p class="title"><b>The Dormouse's story</b></p>,
 <p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>,
 <p class="story">...</p>]

def is_the_only_string_within_a_tag(s):
    """Report whether ``s`` equals the ``.string`` of its parent.

    Used as a ``string=`` filter for ``find_all``: the comparison is between
    the candidate string and whatever ``s.parent.string`` evaluates to.
    """
    parent_string = s.parent.string
    return s == parent_string

soup.find_all(string=is_the_only_string_within_a_tag)

["The Dormouse's story",
 "The Dormouse's story",
 'Elsie',
 'Lacie',
 'Tillie',
 '...']

# 通过一个函数来过滤一类标签属性的时候，这个函数的参数是要被过滤的属性的值，而不是这个标签。
# 下面的示例是搜寻href属性不符合指定正则的a标签
def not_lacie(href):
    """Accept an href value that does not contain the text "lacie".

    Used as an attribute-value filter (``find_all(href=not_lacie)``), so the
    argument is the attribute's value rather than the tag. A falsy value
    (``None`` or an empty string) is handed back unchanged; otherwise the
    result is ``True`` when "lacie" is absent and ``False`` when it is found.
    """
    if not href:
        return href
    return re.search("lacie", href) is None
soup.find_all(href=not_lacie)

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

soup.html.find_all("title")

[<title>The Dormouse's story</title>]

# <title>标签在<html>标签下，但不是直接子节点，<head>标签才是直接子节点。
# 在允许查询所有后代节点时Beautiful Soup能够查找到<title>标签。
# 但是使用了recursive=False参数之后，只能查找直接子节点，这样就查不到<title>标签了
soup.html.find_all("title", recursive=False)

[]

soup.head

<head><title>The Dormouse's story</title></head>

# tag的.contents属性可以将tag的子节点以列表的方式输出，字符串没有.contents属性
soup.head.contents

[<title>The Dormouse's story</title>]

soup.head.contents[0]

<title>The Dormouse's story</title>

soup.head.contents[0].contents

["The Dormouse's story"]

# 通过标签的.children生成器，可以对标签的子节点进行循环
for child in soup.head.children:
    print(child)

<title>The Dormouse's story</title>

len(list(soup.children))

1

for child in soup.children:
    print(child)
    print('*'*30)

<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>
******************************

# .descendants属性可以对所有标签的子孙节点进行递归循环
for child in soup.head.descendants:
    print(child)

<title>The Dormouse's story</title>
The Dormouse's story

len(list(soup.descendants))

26

for child in soup.descendants:
    print(child)
    print('~'*100)

<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
<head><title>The Dormouse's story</title></head>
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
<title>The Dormouse's story</title>
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The Dormouse's story
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body>
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
<p class="title"><b>The Dormouse's story</b></p>
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
<b>The Dormouse's story</b>
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The Dormouse's story
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Once upon a time there were three little sisters; and their names were

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Elsie
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
,

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Lacie
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 and

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Tillie
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
;
and they lived at the bottom of a well.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
<p class="story">...</p>
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
...
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# BeautifulSoup对象本身一定会包含子节点，顶层的<html>标签就是BeautifulSoup对象的子节点：
len(soup.contents)

1

soup.contents[0]

<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>

soup.contents[0].name

'html'

# 如果<b>标签和<c>标签是同一层，则被称为兄弟
# 可以用.next_sibling和.previous_sibling属性来查询兄弟
# 实际文档中的标签的.next_sibling和.previous_sibling属性通常是字符串或空白
soup.a.next_sibling

',\n'

# 通过.next_siblings和.previous_siblings属性可以对当前节点的兄弟节点迭代输出
for sibling in soup.a.next_siblings:
    print(repr(sibling))

',\n'
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
' and\n'
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
';\nand they lived at the bottom of a well.'

soup.find('a', id="link3").next_sibling

';\nand they lived at the bottom of a well.'

# .next_element属性指向解析过程中下一个被解析的对象（字符串或标签），
# 结果可能与.next_sibling相同，但通常是不一样的
soup.find('a', id="link3").next_element

'Tillie'

# .previous_element属性指向当前被解析的对象的前一个解析对象
soup.find('a', id="link3").previous_element

' and\n'

# 通过.next_elements和.previous_elements的迭代器就可以向前或向后
# 访问文档的解析内容，就好像文档正在被解析一样
for element in soup.find('a', id="link3").next_elements:
    print(repr(element))

'Tillie'
';\nand they lived at the bottom of a well.'
'\n'
<p class="story">...</p>
'...'
'\n'

# Beautiful Soup支持大部分的CSS选择器 http://www.w3.org/TR/CSS2/selector.html
# 向Tag或BeautifulSoup对象的.select()方法传入字符串参数，即可使用CSS选择器的语法找到标签
soup.select("title")

[<title>The Dormouse's story</title>]

soup.select("p:nth-of-type(3)")

[<p class="story">...</p>]

# 通过标签名逐层查找
soup.select("body a")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

soup.select("html head title")

[<title>The Dormouse's story</title>]

# 找到某个标签下的直接子标签
soup.select("head > title")

[<title>The Dormouse's story</title>]

soup.select("p > a")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

soup.select("p > a:nth-of-type(2)")

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

soup.select("p > #link1")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

soup.select("body > a")

[]

# CSS中查找id用#，class用.
soup.select("#link1,.sister")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

# 找到兄弟标签
# 获得id为link1的标签之后、class为sister的所有兄弟标签
soup.select("#link1 ~ .sister")

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

# 获得紧跟在id为link1的标签之后、class为sister的下一个兄弟标签
soup.select("#link1 + .sister")

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

# 通过CSS的类名查找
soup.select(".sister")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

soup.select("[class~=sister]")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

# 通过tag的id查找
soup.select("#link1")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

soup.select("a#link2")

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

# 同时用多种CSS选择器查询元素
soup.select("#link1,#link2")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

# 通过是否存在某个属性来查找
soup.select('a[href]')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

# 通过属性的值来查找
soup.select('a[href="http://example.com/elsie"]')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

soup.select('a[href^="http://example.com/"]')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

soup.select('a[href$="tillie"]')

[<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

soup.select('a[href*=".com/el"]')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

# 返回查找到的元素的第一个
soup.select_one(".sister")

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

soloPython博客

搜索此博客

Beautiful Soup

评论

发表评论

此博客中的热门博文

学习地址

MechanicalSoup

安装和卸载软件（msi\exe）