|
我分享一个用Python写的小程序,是在CSDN上找到的
复制内容到剪贴板
代码:
import re

from Parser import *


class NewsParser(Parser):
    """Page parser that pulls fields out of a page's URL and HTML via regexes.

    ``doParse`` must be implemented.  Its ``page`` argument is a dict with two
    keys, ``url`` and ``html``: ``page['url']`` gives the page URL and
    ``page['html']`` gives the page's HTML.
    """

    def doParse(self, page):
        """Extract the fields of one page into a dict.

        Args:
            page: dict with the keys ``'url'`` and ``'html'``.

        Returns:
            A dict that always contains ``'url'``.  It additionally contains
            ``'code'`` and ``'id'`` when the URL matches the notice pattern,
            and ``'title'`` and ``'content'`` when the HTML matches the
            article pattern.
        """
        self.URL = page['url']
        self.Html = page['html']

        result = {}
        result['url'] = page['url']

        # Raw strings keep the regex escapes (\d, \.) intact without relying
        # on Python passing unknown string escapes through unchanged.
        sPattern = r'notice(?P<code>\d{6})_(?P<id>\d{1,7})\.html'
        result.update(self.suckItem(page['url'], sPattern))

        sPattern = (r'<h1>(?P<title>.*?)</h1>.*?'
                    r'<div class="artibody" id="artibody">(?P<content>.*?)</div>')
        result.update(self.suckItem(page['html'], sPattern))

        # dict.has_key() no longer exists in Python 3; ``in`` is equivalent.
        if 'title' in result:
            print(result['url'], result['title'])
        return result

    def suckItem(self, str, sPattern):
        """Search ``str`` for ``sPattern`` and return its named groups.

        The pattern is compiled case-insensitively and with DOTALL, so ``.``
        also matches newlines.

        Args:
            str: text to search.  The name shadows the builtin but is kept
                because it is part of the method's existing signature.
            sPattern: regular expression source containing named groups.

        Returns:
            ``groupdict()`` of the first match, or an empty dict when the
            pattern does not match.
        """
        pattern = re.compile(sPattern, re.IGNORECASE | re.DOTALL)
        m = pattern.search(str)
        if m:
            return m.groupdict()
        return {}

OVER
|