====== 得到HTML指定ID的内容 ======
很多时候,我们需要从一个 HTML 文件中提取指定的内容,例如某个指定 ID 的元素及其内部内容。下面这个脚本实现了这一功能,在此和大家分享一下。
===== Python脚本 =====
##
# created by gudonghua#gmail.com
# posted in http://www.pythonclub.org @ 2011-12-23
#
##
import re
import os
# Tags whose text starts with one of these prefixes are skipped while
# balancing open/close tags: they never have a matching closing tag.
# NOTE(review): the original entries were lost (only an empty string remained,
# which matches every tag); this list is a best-effort reconstruction.
IGNORE_TAGS_START = [
    "<!",
    "<?",
    "<br",
    "<hr",
    "<img",
    "<input",
    "<meta",
    "<link",
]

# Tags whose text ends with one of these suffixes are skipped as well
# (self-closing tags such as ``<foo />``).
IGNORE_TAGS_END = [
    "/>",
]
def get_id_tag(content, id_name):
    """Return the first tag in *content* whose ``id`` attribute is *id_name*.

    The id may be single-quoted, double-quoted or unquoted, and matching is
    case-insensitive.

    Args:
        content: HTML text to search.
        id_name: Value of the ``id`` attribute to look for; surrounding
            whitespace is stripped before matching.

    Returns:
        The full text of the matching tag (e.g. ``<div id="main">``), or an
        empty string when *id_name* is blank or no tag carries that id.
    """
    id_name = id_name.strip()
    if not id_name:
        # A blank id would otherwise match any tag that has an id attribute.
        return ""
    pattern = (
        # ``(?<![\w-])`` keeps attributes such as ``data-id`` from matching.
        r"""<[^>]*(?<![\w-])id\s*=\s*['"]?"""
        # Escape the id so characters like "." are matched literally.
        + re.escape(id_name)
        # The id must end here: at a quote, whitespace, "/" or ">".
        + r"""(?=['"\s/>])[^>]*>"""
    )
    match = re.search(pattern, content, re.DOTALL | re.IGNORECASE)
    return match.group(0) if match else ""
def find_all_tags(content):
    """Collect every ``<...>`` tag of *content* in document order.

    A tag is any run of text that starts with ``<`` and extends to the next
    ``>``; opening tags, closing tags and comments are all returned as-is.

    Args:
        content: HTML text to scan.

    Returns:
        A list with the text of each tag; empty when there are none.
    """
    return [found.group(0) for found in re.finditer("<[^>]*>", content)]
# Elements that never have a closing tag, so they must not affect nesting.
_VOID_TAG_NAMES = frozenset([
    "area", "base", "br", "col", "embed", "hr", "img", "input",
    "link", "meta", "param", "source", "track", "wbr",
])

# Captures the element name of an opening or closing tag.
_TAG_NAME_RE = re.compile(r"<\s*/?\s*([A-Za-z][\w:-]*)")


def _is_ignorable_tag(tag):
    """Return True when *tag* neither opens nor closes a nested element."""
    # Comments, doctypes, processing instructions and self-closing tags.
    if tag.startswith(("<!", "<?")) or tag.endswith("/>"):
        return True
    name = _TAG_NAME_RE.match(tag)
    return name is None or name.group(1).lower() in _VOID_TAG_NAMES


def get_html_id(content, id_name):
    """Return the HTML of the element whose ``id`` attribute is *id_name*.

    The element is located with ``get_id_tag`` and its end is found by
    counting opening and closing tags from ``find_all_tags`` until the
    nesting depth returns to zero. Tag names are not matched against each
    other, so badly unbalanced markup may yield an unexpected span.

    Args:
        content: HTML text to search.
        id_name: Value of the ``id`` attribute of the wanted element.

    Returns:
        The text from the element's opening tag through its closing tag,
        inclusive. An empty string is returned when no tag has that id or
        when the element is never closed.
    """
    id_tag = get_id_tag(content, id_name)
    if not id_tag:
        return ""
    if _is_ignorable_tag(id_tag):
        # A void or self-closing element has no separate closing tag.
        return id_tag

    start_index = content.find(id_tag)
    # ``index`` always points just past the last tag that was accounted for.
    index = start_index + len(id_tag)
    depth = 1
    seen_id_tag = False
    for tag in find_all_tags(content):
        if not seen_id_tag:
            # Skip everything up to and including the element's opening tag.
            if tag == id_tag:
                seen_id_tag = True
            continue
        if _is_ignorable_tag(tag):
            continue
        # Search from ``index`` so repeated tag text resolves to the
        # occurrence currently being processed.
        index = content.find(tag, index) + len(tag)
        if tag.startswith("</"):
            depth -= 1
        else:
            depth += 1
        if depth == 0:
            return content[start_index:index]
    return ""
if __name__ == "__main__":
    # Demo: print the element with id "bodytext" from a local sample file.
    # The ``with`` block closes the file handle as soon as it has been read.
    with open("ft2.htm") as html_file:
        content = html_file.read()
    # A parenthesised single-argument print is valid on Python 2 and 3.
    print(get_html_id(content, "bodytext"))