スラドのsearch.plを読み込んでインデックスリストを得る

正規表現やってた成果がこれ．

# -*- coding: utf-8 -*-
"""
slashdotjp.py
Module for reading the Slashdot Japan story list, stories, and comments.
"""
import urllib,re
# HTML comment that marks the start of the story list in the search.pl page;
# get_index() keeps only the text that follows it.
START_INDEX_LIST = "<!-- start template: ID 45, storysearch;search;default -->"
# HTML comment that marks the end of the story list; get_index() discards
# this marker and everything after it.
END_INDEX_LIST = "<!-- end template: ID 45, storysearch;search;default -->"
def _parse_topics(topics_html, topic_pattern):
    """Return the topics linked in *topics_html* as a list of dicts.

    The fragment is split on ``',&nbsp;'`` and each piece is searched with
    *topic_pattern*; every match contributes a dict with the keys ``tid``
    (topic id) and ``topic`` (topic name).
    """
    topics = []
    for fragment in topics_html.split(',&nbsp;'):
        found = topic_pattern.search(fragment)
        if found is None:
            # A piece without a topic link carries no topic; skip it rather
            # than failing on the missing match.
            continue
        topics.append(found.groupdict())
    return topics


def _parse_index(s):
    """Parse the story-list HTML *s* (unicode) into a list of story dicts.

    Parsing walks forward through *s*, matching the headline, summary,
    section and topic patterns in that order for each story.  It stops after
    a story that is directly followed by ``<A HREF="`` or as soon as any of
    the patterns no longer matches; only completely parsed stories are
    returned.
    """
    # Compile every pattern once instead of once per story / per topic.
    # URL, story id (which embeds year/month/day) and headline.
    headline_p = re.compile(unicode(r'<a href="(?P<surl>.*?=(?P<sid>(?P<year>\d+?)/(?P<month>\d+?)/(?P<day>\d+?)/.*?))">(?P<title>.*?)</a>', 'utf-8'))
    # Comment count and the first part of the story body.
    summary_p = re.compile(unicode(r'<br><font size="-1">.+?、(?P<comments>\d+?)個のコメント</font><br>(?P<formerbody>.+?)<br>', 'utf-8'))
    # Section URL and section name.
    section_p = re.compile(unicode(r'<font size="-1"><a href="(?P<securl>.+?)">(?P<secname>.+?)</a> &gt;', 'utf-8'))
    # The run of topic links that closes a story entry.
    topics_p = re.compile(unicode(r'(?P<topics>.+?)<br> </font><p>', 'utf-8'))
    # One topic link inside that run.
    topic_p = re.compile(unicode(r'<a href="(?:.+?=)(?P<tid>\d+?)">(?P<topic>.+?)</a>', 'utf-8'))

    field_patterns = (headline_p, summary_p, section_p)
    index = []
    pos = 0
    while True:
        story = {}
        m = None
        for pattern in field_patterns:
            m = pattern.search(s, pos)
            if m is None:
                break
            story.update(m.groupdict())
            pos = m.end()
        if m is None:
            # No further (complete) story in the page: end of the list.
            break
        m = topics_p.search(s, pos)
        if m is None:
            break
        story['topics'] = _parse_topics(m.group('topics'), topic_p)
        pos = m.end()
        # Append only now, so the list never holds a half-filled story.
        index.append(story)
        # An upper-case link right after a story marks the end of the list.
        if s.startswith('<A HREF="', pos):
            break
    return index


def get_index():
    """
    Fetch the story list from Slashdot Japan's search.pl and parse it.

    The page at http://slashdot.jp/search.pl is downloaded, decoded as
    UTF-8, stripped of newlines and tabs, and cut down to the text between
    START_INDEX_LIST and END_INDEX_LIST.  Each story found there becomes a
    dict with the following keys (all values are the matched text):

    - surl: story URL
    - sid: story id
    - year, month, day: posting date
    - title: headline
    - comments: number of comments
    - formerbody: first part of the story body
    - securl: section URL
    - secname: section name
    - topics: list of dicts, each with the keys ``tid`` and ``topic``

    The story id and title of every story are printed, one story per line.

    Returns:
        The list of story dicts, in page order.

    Raises:
        IndexError: if START_INDEX_LIST does not occur in the page.
        Errors raised by urllib.urlopen or by the UTF-8 decoding propagate
        unchanged.
    """
    response = urllib.urlopen('http://slashdot.jp/search.pl')
    try:
        s = response.read()
    finally:
        # Release the connection even when reading fails.
        response.close()
    s = unicode(s, 'utf-8')
    # Drop newlines and tabs so the patterns can match across source lines.
    s = s.replace('\n', '').replace('\t', '')
    # Keep only the text between the two template markers.
    s = s.split(START_INDEX_LIST)[1]
    s = s.split(END_INDEX_LIST)[0]

    index = _parse_index(s)
    for story in index:
        # Single parenthesized argument: prints "sid title" exactly like the
        # two-item print statement would.
        print('%s %s' % (story['sid'], story['title']))
    return index
    

def get_story(url):
    """
    Fetch the body and comments of a single story.

    Not implemented yet: ``url`` is ignored and None is returned.
    """
    # TODO: implement.
    return None
# When run as a script, fetch the story index; get_index() prints the id and
# title of each story it finds.
if __name__ == "__main__":
    get_index()

まだメソッドの引数のインターフェースをどうするか決めてません．
ディクショナリのリスト(rubyではハッシュの配列と呼んでるらしい)にする仕様は http://slashdot.jp/code.shtml のhswから拝借しましたが，こういうときGPLはうつるんだろうか…

こもろぐ @tenkoma

What We Find Changes Who We Become -- Peter Morville著『アンビエント・ファインダビリティ』

スラドのsearch.plを読み込んでインデックスリストを得る