Python俱乐部
Python
小课题
京东优惠券
很多时候,我们需要从一个HTML文件中提取指定的内容,比如某个指定ID的元素及其内容。为此我写了一个脚本来实现这个功能,和大家分享一下。
##
# created by gudonghua#gmail.com
# posted in http://www.pythonclub.org @ 2011-12-23
#
##
"""Extract the HTML element that carries a given ``id`` attribute.

The extraction is regex based (no real HTML parser): tags are located with
``<[^>]*>`` and open/close tags are counted until the element that owns the
id is balanced.  It is meant for reasonably well-formed markup.
"""
import logging
import os
import re

_log = logging.getLogger(__name__)

# Lower-case prefixes of tags that never get a closing tag (images, line
# breaks, comments / doctype) and therefore must not affect nesting depth.
IGNORE_TAGS_START = ["<img", "<br", "<!"]
# Suffixes marking self-closing tags such as ``<foo ... />``.
IGNORE_TAGS_END = ["/>"]

# HTML void elements: they have no closing tag even when written without
# a trailing "/>", so they are skipped exactly like the prefixes above.
_VOID_TAGS = frozenset([
    "area", "base", "br", "col", "embed", "hr", "img", "input",
    "link", "meta", "param", "source", "track", "wbr",
])

_TAG_RE = re.compile(r"<[^>]*>")
_TAG_NAME_RE = re.compile(r"<\s*([a-z][a-z0-9:-]*)")


def _search_id_tag(content, id_name):
    """Return the regex match of the first tag whose id is *id_name*, or None."""
    name = re.escape(id_name.strip())
    if not name:
        # An empty id would otherwise match unrelated tags.
        return None
    # The value may be double-quoted, single-quoted or bare; a bare value
    # must be followed by whitespace, "/" or ">" so "foo" does not match
    # "foobar".  Requiring whitespace before "id" rejects "data-id=...".
    value = r"""(?:"{0}"|'{0}'|{0}(?=[\s/>]))""".format(name)
    pattern = r"<[^>]*?\sid\s*=\s*" + value + r"[^>]*>"
    return re.search(pattern, content, re.DOTALL | re.IGNORECASE)


def _is_ignored_tag(tag):
    """Return True when *tag* neither opens nor closes a nesting level."""
    lowered = tag.lower()
    if lowered.startswith(tuple(IGNORE_TAGS_START)):
        return True
    if lowered.endswith(tuple(IGNORE_TAGS_END)):
        return True
    name = _TAG_NAME_RE.match(lowered)
    return name is not None and name.group(1) in _VOID_TAGS


def get_id_tag(content, id_name):
    """Return the opening tag whose ``id`` attribute equals *id_name*.

    Surrounding whitespace in *id_name* is ignored and the comparison is
    case-insensitive.  Returns the tag text (from "<" to ">") of the first
    match, or an empty string when there is none.
    """
    match = _search_id_tag(content, id_name)
    return match.group(0) if match else ""


def find_all_tags(content):
    """Return every ``<...>`` tag found in *content*, in document order."""
    return _TAG_RE.findall(content)


def get_html_id(content, id_name):
    """Return the full element (opening tag through closing tag) with the id.

    Tags following the id tag are scanned by position; every opening tag
    increases the nesting depth and every closing tag decreases it, while
    comments, self-closing tags and void elements are skipped.  The slice of
    *content* from the id tag up to the tag that balances it is returned.

    Returns an empty string when the id is not present or the element is
    never closed.  When the id sits on a void / self-closing tag, that tag
    alone is returned.
    """
    id_match = _search_id_tag(content, id_name)
    if id_match is None:
        return ""

    id_tag = id_match.group(0)
    _log.debug("id tag found: %s", id_tag)
    if _is_ignored_tag(id_tag):
        # e.g. <img id="logo" .../>: there is no closing tag to look for.
        return id_tag

    depth = 1
    # Walk the tags by match position instead of re-searching their text,
    # so look-alike text inside a skipped comment cannot shift the result.
    for tag_match in _TAG_RE.finditer(content, id_match.end()):
        tag = tag_match.group(0)
        if _is_ignored_tag(tag):
            continue
        if tag.startswith("</"):
            depth -= 1
        else:
            depth += 1
        if depth == 0:
            return content[id_match.start():tag_match.end()]
    return ""


if __name__ == "__main__":
    with open("ft2.htm") as html_file:
        content = html_file.read()
    print(get_html_id(content, "bodytext"))