====== 得到HTML指定ID的内容 ======

大多时候,我们需要得到一个HTML文件中指定的内容,比如得到指定ID的内容,写了个脚本,就实现了这个功能,和大家一起分享一下。

===== Python脚本 =====

##
# created by gudonghua#gmail.com
# posted in http://www.pythonclub.org @ 2011-12-23
#
##

import re
import os

# Prefixes of tags that get_html_id skips while scanning.
# NOTE(review): the only entry is an empty string, and str.startswith("") is
# always True, so as written every tag would be skipped. Presumably the real
# entries were lost when this page was extracted -- confirm against the
# original script. IGNORE_TAGS_END is used below but is not defined anywhere
# in the visible text.
IGNORE_TAGS_START = [ "" ]


def get_id_tag(content, id_name):
    """Return the first tag in ``content`` whose ``id`` attribute is ``id_name``.

    ``id_name`` is stripped of surrounding whitespace and spliced directly
    into a regular expression. The id value may be preceded by an optional
    quote and must be followed by a quote or a space; matching ignores case
    and may span newlines.

    Returns the text of the first matching tag (from ``<`` to ``>``). When
    nothing matches, the empty list produced by ``re.findall`` is returned
    unchanged, so callers should test the result for truthiness.

    NOTE(review): ``id_name`` is not passed through ``re.escape``, so regex
    metacharacters in it are interpreted as pattern syntax. An unquoted id
    followed immediately by ``>`` (e.g. ``<div id=main>``) does not match,
    and any attribute whose name ends in ``id`` (e.g. ``data-id``) does.
    """
    id_name = id_name.strip()
    # "<" ... id= [optional quote] id_name [quote or space] ... ">"
    patt_id_tag = """<[^>]*id=['"]?""" + id_name + """['" ][^>]*>"""
    id_tag = re.findall(patt_id_tag, content, re.DOTALL|re.IGNORECASE)
    if id_tag:
        # Keep only the first matching tag.
        id_tag = id_tag[0]
    return id_tag


def find_all_tags(content):
    """Return every ``<...>`` substring of ``content``, in document order.

    Anything enclosed in angle brackets is returned, whether it is an
    opening tag, a closing tag, or some other bracketed construct.
    """
    tag_patt = """<[^>]*>"""
    tags = re.findall(tag_patt, content)
    return tags


# NOTE(review): get_html_id is cut off in the available text -- it ends in the
# middle of a string literal -- so it is reproduced below exactly as far as it
# goes and is not documented further. The indentation is reconstructed from
# the statement syntax and should be checked against the original script.
def get_html_id(content, id_name):
    tag_content = ""
    all_tags = find_all_tags(content)
    id_tag = get_id_tag(content, id_name)
    print "id_tag", id_tag
    tag_stack = []
    if not id_tag:
        return ""
    in_tag = 0
    id_content = ""
    index = 0
    for tag in all_tags:
        if in_tag == 0 and tag == id_tag:
            tag_stack.append(tag)
            start_index = content.find(tag)
            index = start_index + len(tag)
            in_tag = 1
            print "in_tag", tag
        elif in_tag == 1:
            print len(tag_stack), tag_stack[0:2]
            ignore_flag = 0
            for t in IGNORE_TAGS_START:
                if tag.startswith(t):
                    ignore_flag = 1; break
            for t in IGNORE_TAGS_END:
                if tag.endswith(t):
                    ignore_flag = 1; break
            if ignore_flag:
                continue
            if tag.startswith("