mecab.scm
1 | (define-module (mecab mecab) |
2 | #:use-module (bytestructures guile) |
3 | #:use-module (ice-9 match) |
4 | #:use-module (mecab configuration) |
5 | #:use-module (rnrs bytevectors) |
6 | #:use-module ((system foreign) #:select |
7 | (pointer->procedure pointer->string string->pointer |
8 | bytevector->pointer pointer->bytevector |
9 | pointer-address make-pointer |
10 | sizeof %null-pointer)) |
11 | #:use-module ((system foreign) #:prefix foreign:) |
12 | #:export (mecab-version |
13 | mecab-new-tagger |
14 | mecab-error |
15 | mecab-parse |
16 | mecab-parse-to-str |
17 | mecab-destroy |
18 | |
19 | node-next |
20 | node-prev |
21 | node-feature |
22 | node-surface |
23 | node-stat |
24 | MECAB_NOR_NODE MECAB_UNK_NODE MECAB_BOS_NODE MECAB_EOS_NODE MECAB_EON_NODE |
25 | |
26 | mecab-split |
27 | mecab-features |
28 | mecab-words)) |
29 | |
30 | (define (make-double-pointer) |
31 | (bytevector->pointer (make-bytevector (sizeof '*)))) |
32 | |
33 | (define (mecab->procedure return name params) |
34 | (let ((ptr (dynamic-func name (dynamic-link %libmecab)))) |
35 | (pointer->procedure return ptr params))) |
36 | |
37 | (define mecab-version |
38 | (let ((proc (mecab->procedure '* "mecab_version" '()))) |
39 | (lambda () |
40 | (pointer->string (proc))))) |
41 | |
42 | (define mecab-new-tagger |
43 | (let ((proc (mecab->procedure '* "mecab_new" `(,foreign:int *)))) |
44 | (lambda (options) |
45 | (let* ((options (map string->utf8 (cons "mecab" options))) |
46 | (bv (make-bytevector (* (sizeof '*) (+ (length options) 1)) 0))) |
47 | (cond |
48 | ((= (sizeof '*) 4) |
49 | (let loop ((i 0) (options options)) |
50 | (match options |
51 | (() #t) |
52 | ((option options ...) |
53 | (bytevector-u32-set! bv (* 4 i) (pointer-address (bytevector->pointer option)) (native-endianness)) |
54 | (loop (+ i 1) options))))) |
55 | ((= (sizeof '*) 8) |
56 | (let loop ((i 0) (options options)) |
57 | (match options |
58 | (() #t) |
59 | ((option options ...) |
60 | (bytevector-u64-set! bv (* 8 i) (pointer-address (bytevector->pointer option)) (native-endianness)) |
61 | (loop (+ i 1) options))))) |
62 | (else (throw 'unsupported-pointer-size (sizeof '*)))) |
63 | (proc (length options) |
64 | (bytevector->pointer bv)))))) |
65 | |
66 | (define mecab-error |
67 | (let ((proc (mecab->procedure '* "mecab_strerror" '(*)))) |
68 | (lambda (tagger) |
69 | (pointer->string (proc tagger))))) |
70 | |
71 | (define %mecab-path |
72 | (bs:pointer |
73 | (delay (bs:struct |
74 | `((rnode ,%mecab-node) |
75 | (rnext ,%mecab-path) |
76 | (lnode ,%mecab-node) |
77 | (lnext ,%mecab-path) |
78 | (cost ,int) |
79 | (prob ,float)))))) |
80 | |
81 | (define %mecab-node |
82 | (bs:pointer |
83 | (delay (bs:struct |
84 | `((prev ,%mecab-node) |
85 | (next ,%mecab-node) |
86 | (enext ,%mecab-node) |
87 | (bnext ,%mecab-node) |
88 | (rpath ,%mecab-path) |
89 | (lpath ,%mecab-path) |
90 | (surface ,(bs:pointer uint8)); char *, not 0-terminated, use length |
91 | (feature ,cstring-pointer) |
92 | (id ,unsigned-int) |
93 | (length ,unsigned-short) |
94 | (rlength ,unsigned-short) |
95 | (rcattr ,unsigned-short) |
96 | (lcattr ,unsigned-short) |
97 | (posid ,unsigned-short) |
98 | (chartype ,uint8) |
99 | (stat ,uint8) |
100 | (isbest ,uint8) |
101 | (alpha ,float) |
102 | (beta ,float) |
103 | (prob ,float) |
104 | (wcost ,short) |
105 | (cost ,long)))))) |
106 | |
107 | (define mecab-parse |
108 | (let ((proc (mecab->procedure '* "mecab_sparse_tonode" '(* *)))) |
109 | (lambda (tagger str) |
110 | (bytestructure-ref |
111 | (bytestructure |
112 | %mecab-node |
113 | (pointer-address (proc tagger (string->pointer str)))) |
114 | '*)))) |
115 | |
116 | (define mecab-parse-to-str |
117 | (let ((proc (mecab->procedure '* "mecab_sparse_tostr" '(* *)))) |
118 | (lambda (tagger str) |
119 | (pointer->string (proc tagger (string->pointer str)))))) |
120 | |
121 | (define mecab-destroy |
122 | (let ((proc (mecab->procedure foreign:void "mecab_destroy" '(*)))) |
123 | (lambda (tagger) |
124 | (proc tagger)))) |
125 | |
126 | (define MECAB_NOR_NODE 0) |
127 | (define MECAB_UNK_NODE 1) |
128 | (define MECAB_BOS_NODE 2) |
129 | (define MECAB_EOS_NODE 3) |
130 | (define MECAB_EON_NODE 4) |
131 | |
132 | (define (node-feature node) |
133 | (bytestructure-ref node 'feature)) |
134 | (define (node-surface node) |
135 | (let* ((surface (bytestructure-ref node 'surface)) |
136 | (len (bytestructure-ref node 'length)) |
137 | (pointer (make-pointer surface)) |
138 | (bv (pointer->bytevector pointer len)) |
139 | (str (make-bytevector (+ (bytevector-length bv) 1)))) |
140 | (bytevector-copy! bv 0 str 0 (bytevector-length bv)) |
141 | (bytevector-u8-set! str (bytevector-length bv) 0) |
142 | (pointer->string (bytevector->pointer str)))) |
143 | (define (node-next node) |
144 | (bytestructure-ref node 'next '*)) |
145 | (define (node-prev node) |
146 | (bytestructure-ref node 'prev '*)) |
147 | (define (node-stat node) |
148 | (bytestructure-ref node 'stat)) |
149 | |
150 | (define (mecab-split tagger str) |
151 | (let ((bos-node (mecab-parse tagger str))) |
152 | (let loop ((node (node-next bos-node)) (surfaces '())) |
153 | (if (= (node-stat node) MECAB_EOS_NODE) |
154 | (reverse surfaces) |
155 | (loop (node-next node) (cons (node-surface node) surfaces)))))) |
156 | |
157 | (define (mecab-features tagger str) |
158 | (let ((bos-node (mecab-parse tagger str))) |
159 | (let loop ((node (node-next bos-node)) (surfaces '())) |
160 | (if (= (node-stat node) MECAB_EOS_NODE) |
161 | (reverse surfaces) |
162 | (loop (node-next node) (cons (node-feature node) surfaces)))))) |
163 | |
164 | (define (mecab-words tagger str) |
165 | (let ((bos-node (mecab-parse tagger str))) |
166 | (let loop ((node (node-next bos-node)) (words '())) |
167 | (cond |
168 | ((= (node-stat node) MECAB_EOS_NODE) words) |
169 | ((= (node-stat node) MECAB_NOR_NODE) |
170 | ;; 動詞,自立,*,*,五段・ラ行,連用タ接続,当たる,アタッ,アタッ |
171 | (match (string-split (node-feature node) #\,) |
172 | ((gram1 gram2 gram3 gram4 gram5 gram6 dictionary read1 read2) |
173 | (match gram1 |
174 | ("助動詞" (loop (node-next node) words)) |
175 | ("記号" (loop (node-next node) words)) |
176 | (_ (loop (node-next node) (cons dictionary words))))))) |
177 | (else (loop (node-next node) words)))))) |
178 |