Reading phrase table (for Moses) using Python

I’m going to analyze a phrase table generated by Moses, so I studied the phrase table format described at http://www.statmt.org/moses/?n=FactoredTraining.ScorePhrases and wrote a Python script that reads a phrase table into Python dicts, one dict per entry. The code is as follows.

import re

def _decode_tokens(field):
    return filter(lambda t: t != '', re.split(" ", field))

def _decode_link(link):
    m = re.match("\((.*)\)", link)
    if m:
        toks = filter(lambda l: l != '', re.split(",", m.group(1)))
        return map(lambda l: int(l), toks)
    else:
        raise RuntimeError

def _decode_links(field):
    """Parse a space-separated run of alignment groups, e.g. ``"(0) (1,2)"``.

    Every non-empty chunk of ``field`` is passed to ``_decode_link``; the
    decoded groups are returned as a list in their original order. Building
    the list eagerly keeps the result indexable and re-iterable on both
    Python 2 and Python 3, and makes malformed groups fail here rather
    than later when the result is first consumed.
    """
    return [_decode_link(chunk) for chunk in field.split(" ") if chunk != '']

def _decode_num(field):
    toks = filter(lambda t: t != '', re.split(" ", field))
    return map(lambda tok: float(tok), toks)

def read_phrase_table(filename):
    """Lazily parse a phrase-table file, yielding one dict per line.

    Every line must contain exactly five ``|||``-separated fields, in this
    order: source tokens, target tokens, links, reverse links, and a
    space-separated run of at least five numeric scores.

    Yields:
        dict with the keys ``source``, ``target``, ``links``,
        ``rev_links``, ``phrase_trans_prob``, ``lex_weight``,
        ``rev_phrase_trans_prob``, ``rev_lex_weight`` and
        ``phrase_penalty`` (the last five are the first five scores, in
        that order).

    Raises:
        RuntimeError: if a line does not split into exactly five fields.
        IndexError: if the score field holds fewer than five numbers.
    """
    NUM_FIELD = 5
    # The ``with`` block guarantees the file is closed once the generator
    # is exhausted, closed, or garbage-collected.
    with open(filename) as table:
        for lineno, line in enumerate(table, 1):
            fields = line.strip().split("|||")
            if len(fields) != NUM_FIELD:
                raise RuntimeError(
                    "%s:%d: expected %d '|||'-separated fields, found %d"
                    % (filename, lineno, NUM_FIELD, len(fields)))
            phrase = {}
            phrase['source'] = _decode_tokens(fields[0])
            phrase['target'] = _decode_tokens(fields[1])
            phrase['links'] = _decode_links(fields[2])
            phrase['rev_links'] = _decode_links(fields[3])
            # list() so the scores can be indexed even when the helper
            # hands back a lazy iterable.
            nums = list(_decode_num(fields[4]))
            phrase['phrase_trans_prob'] = nums[0]
            phrase['lex_weight'] = nums[1]
            phrase['rev_phrase_trans_prob'] = nums[2]
            phrase['rev_lex_weight'] = nums[3]
            phrase['phrase_penalty'] = nums[4]
            yield phrase

def main(filename="phrase-table.0-0"):
    """Print every entry that ``read_phrase_table`` yields for ``filename``.

    ``filename`` defaults to the path the script originally hard-coded, so
    calling ``main()`` with no arguments behaves as before.
    """
    for phrase in read_phrase_table(filename):
        # Parenthesised single-argument form: prints the same text under the
        # Python 2 print statement and is a valid call on Python 3.
        print(phrase)

# Run main() only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == '__main__':
    main()

ใส่ความเห็น

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / เปลี่ยนแปลง )

Twitter picture

You are commenting using your Twitter account. Log Out / เปลี่ยนแปลง )

Facebook photo

You are commenting using your Facebook account. Log Out / เปลี่ยนแปลง )

Google+ photo

You are commenting using your Google+ account. Log Out / เปลี่ยนแปลง )

Connecting to %s