This is a Go implementation of MMSEG, which is a Chinese word splitting algorithm.
- Documentation/comments
- Benchmark
#Input Dictionary Format
Key\tFreq
Each key occupies one line. The file should be UTF-8 encoded; please refer to go-darts for details.
#Code example
package main
import (
"fmt"
"time"
"os"
"mmsego"
"bufio"
"log"
)
// main loads the dictionary "darts.lib", feeds /tmp/a.txt to the segmenter one
// line at a time, and prints the total elapsed time.
func main() {
	var s = new(mmsego.Segmenter)
	// NOTE(review): Init is assumed to return an error, since the example
	// checks err right after this call — confirm against the mmsego API.
	if err := s.Init("darts.lib"); err != nil {
		log.Fatal(err)
	}

	t := time.Now()
	// offset is the byte position of the current line within the whole input;
	// it grows by len(line) after every line handed to Mmseg.
	offset := 0

	unifile, err := os.Open("/tmp/a.txt")
	if err != nil {
		log.Fatal(err)
	}
	defer unifile.Close()

	uniLineReader := bufio.NewReaderSize(unifile, 4000)

	// The callback used inside the loop deliberately does nothing so that only
	// the segmentation itself is timed. To print every word instead, use:
	//   func(off, length int) { fmt.Printf("%s ", string(line[off-offset:off-offset+length])) }
	discardWord := func(off, length int) {}

	line, bufErr := uniLineReader.ReadString('\n')
	for bufErr == nil {
		s.Mmseg(line, offset, discardWord, nil, false)
		offset += len(line)
		line, bufErr = uniLineReader.ReadString('\n')
	}

	// ReadString returned an error (normally end of file); line holds whatever
	// was read before it, possibly nothing. Words are reported with offsets
	// that include the running offset, so subtract it to index into line.
	printWord := func(off int, length int) {
		fmt.Printf("%s ", string(line[off-offset:off-offset+length]))
	}
	// NOTE(review): the final argument is true only for this last call —
	// presumably it marks the last chunk of input; confirm against mmsego.
	s.Mmseg(line, offset, printWord, nil, true)

	fmt.Printf("Duration: %v\n", time.Since(t))
}
Apache License 2.0