guile-mecab/mecab/mecab.scm

mecab.scm

1
(define-module (mecab mecab)
2
  #:use-module (bytestructures guile)
3
  #:use-module (ice-9 match)
4
  #:use-module (mecab configuration)
5
  #:use-module (rnrs bytevectors)
6
  #:use-module ((system foreign) #:select
7
                                 (pointer->procedure pointer->string string->pointer
8
                                  bytevector->pointer pointer->bytevector
9
                                  pointer-address make-pointer
10
                                  sizeof %null-pointer))
11
  #:use-module ((system foreign) #:prefix foreign:)
12
  #:export (mecab-version
13
            mecab-new-tagger
14
            mecab-error
15
            mecab-parse
16
            mecab-parse-to-str
17
            mecab-destroy
18
19
            node-next
20
            node-prev
21
            node-feature
22
            node-surface
23
            node-stat
24
            MECAB_NOR_NODE MECAB_UNK_NODE MECAB_BOS_NODE MECAB_EOS_NODE MECAB_EON_NODE
25
26
            mecab-split
27
            mecab-features
28
            mecab-words))
29
30
(define (make-double-pointer)
31
  (bytevector->pointer (make-bytevector (sizeof '*))))
32
33
(define (mecab->procedure return name params)
34
  (let ((ptr (dynamic-func name (dynamic-link %libmecab))))
35
    (pointer->procedure return ptr params)))
36
37
(define mecab-version
38
  (let ((proc (mecab->procedure '* "mecab_version" '())))
39
    (lambda ()
40
      (pointer->string (proc)))))
41
42
(define mecab-new-tagger
43
  (let ((proc (mecab->procedure '* "mecab_new" `(,foreign:int *))))
44
    (lambda (options)
45
      (let* ((options (map string->utf8 (cons "mecab" options)))
46
             (bv (make-bytevector (* (sizeof '*) (+ (length options) 1)) 0)))
47
        (cond
48
          ((= (sizeof '*) 4)
49
           (let loop ((i 0) (options options))
50
             (match options
51
               (() #t)
52
               ((option options ...)
53
                (bytevector-u32-set! bv (* 4 i) (pointer-address (bytevector->pointer option)) (native-endianness))
54
                (loop (+ i 1) options)))))
55
          ((= (sizeof '*) 8)
56
           (let loop ((i 0) (options options))
57
             (match options
58
               (() #t)
59
               ((option options ...)
60
                (bytevector-u64-set! bv (* 8 i) (pointer-address (bytevector->pointer option)) (native-endianness))
61
                (loop (+ i 1) options)))))
62
          (else (throw 'unsupported-pointer-size (sizeof '*))))
63
        (proc (length options)
64
              (bytevector->pointer bv))))))
65
66
(define mecab-error
67
  (let ((proc (mecab->procedure '* "mecab_strerror" '(*))))
68
    (lambda (tagger)
69
      (pointer->string (proc tagger)))))
70
71
(define %mecab-path
72
  (bs:pointer
73
    (delay (bs:struct
74
             `((rnode ,%mecab-node)
75
               (rnext ,%mecab-path)
76
               (lnode ,%mecab-node)
77
               (lnext ,%mecab-path)
78
               (cost ,int)
79
               (prob ,float))))))
80
81
(define %mecab-node
82
  (bs:pointer
83
    (delay (bs:struct
84
             `((prev ,%mecab-node)
85
               (next ,%mecab-node)
86
               (enext ,%mecab-node)
87
               (bnext ,%mecab-node)
88
               (rpath ,%mecab-path)
89
               (lpath ,%mecab-path)
90
               (surface ,(bs:pointer uint8)); char *, not 0-terminated, use length
91
               (feature ,cstring-pointer)
92
               (id ,unsigned-int)
93
               (length ,unsigned-short)
94
               (rlength ,unsigned-short)
95
               (rcattr ,unsigned-short)
96
               (lcattr ,unsigned-short)
97
               (posid ,unsigned-short)
98
               (chartype ,uint8)
99
               (stat ,uint8)
100
               (isbest ,uint8)
101
               (alpha ,float)
102
               (beta ,float)
103
               (prob ,float)
104
               (wcost ,short)
105
               (cost ,long))))))
106
107
(define mecab-parse
108
  (let ((proc (mecab->procedure '* "mecab_sparse_tonode" '(* *))))
109
    (lambda (tagger str)
110
      (bytestructure-ref
111
        (bytestructure
112
          %mecab-node
113
          (pointer-address (proc tagger (string->pointer str))))
114
        '*))))
115
116
(define mecab-parse-to-str
117
  (let ((proc (mecab->procedure '* "mecab_sparse_tostr" '(* *))))
118
    (lambda (tagger str)
119
      (pointer->string (proc tagger (string->pointer str))))))
120
121
(define mecab-destroy
122
  (let ((proc (mecab->procedure foreign:void "mecab_destroy" '(*))))
123
    (lambda (tagger)
124
      (proc tagger))))
125
126
(define MECAB_NOR_NODE 0)
127
(define MECAB_UNK_NODE 1)
128
(define MECAB_BOS_NODE 2)
129
(define MECAB_EOS_NODE 3)
130
(define MECAB_EON_NODE 4)
131
132
(define (node-feature node)
133
  (bytestructure-ref node 'feature))
134
(define (node-surface node)
135
  (let* ((surface (bytestructure-ref node 'surface))
136
         (len (bytestructure-ref node 'length))
137
         (pointer (make-pointer surface))
138
         (bv (pointer->bytevector pointer len))
139
         (str (make-bytevector (+ (bytevector-length bv) 1))))
140
    (bytevector-copy! bv 0 str 0 (bytevector-length bv))
141
    (bytevector-u8-set! str (bytevector-length bv) 0)
142
    (pointer->string (bytevector->pointer str))))
143
(define (node-next node)
144
  (bytestructure-ref node 'next '*))
145
(define (node-prev node)
146
  (bytestructure-ref node 'prev '*))
147
(define (node-stat node)
148
  (bytestructure-ref node 'stat))
149
150
(define (mecab-split tagger str)
151
  (let ((bos-node (mecab-parse tagger str)))
152
    (let loop ((node (node-next bos-node)) (surfaces '()))
153
      (if (= (node-stat node) MECAB_EOS_NODE)
154
          (reverse surfaces)
155
          (loop (node-next node) (cons (node-surface node) surfaces))))))
156
157
(define (mecab-features tagger str)
158
  (let ((bos-node (mecab-parse tagger str)))
159
    (let loop ((node (node-next bos-node)) (surfaces '()))
160
      (if (= (node-stat node) MECAB_EOS_NODE)
161
          (reverse surfaces)
162
          (loop (node-next node) (cons (node-feature node) surfaces))))))
163
164
(define (mecab-words tagger str)
165
  (let ((bos-node (mecab-parse tagger str)))
166
    (let loop ((node (node-next bos-node)) (words '()))
167
      (cond
168
        ((= (node-stat node) MECAB_EOS_NODE) words)
169
        ((= (node-stat node) MECAB_NOR_NODE)
170
         ;; 動詞,自立,*,*,五段・ラ行,連用タ接続,当たる,アタッ,アタッ
171
         (match (string-split (node-feature node) #\,)
172
           ((gram1 gram2 gram3 gram4 gram5 gram6 dictionary read1 read2)
173
            (match gram1
174
              ("助動詞" (loop (node-next node) words))
175
              ("記号" (loop (node-next node) words))
176
              (_ (loop (node-next node) (cons dictionary words)))))))
177
        (else (loop (node-next node) words))))))
178