(define-module (mecab mecab) #:use-module (bytestructures guile) #:use-module (ice-9 match) #:use-module (mecab configuration) #:use-module (rnrs bytevectors) #:use-module ((system foreign) #:select (pointer->procedure pointer->string string->pointer bytevector->pointer pointer->bytevector pointer-address make-pointer sizeof %null-pointer)) #:use-module ((system foreign) #:prefix foreign:) #:export (mecab-version mecab-new-tagger mecab-error mecab-parse mecab-parse-to-str mecab-destroy node-next node-prev node-feature node-surface node-stat MECAB_NOR_NODE MECAB_UNK_NODE MECAB_BOS_NODE MECAB_EOS_NODE MECAB_EON_NODE mecab-split mecab-features mecab-words)) (define (make-double-pointer) (bytevector->pointer (make-bytevector (sizeof '*)))) (define (mecab->procedure return name params) (let ((ptr (dynamic-func name (dynamic-link %libmecab)))) (pointer->procedure return ptr params))) (define mecab-version (let ((proc (mecab->procedure '* "mecab_version" '()))) (lambda () (pointer->string (proc))))) (define mecab-new-tagger (let ((proc (mecab->procedure '* "mecab_new" `(,foreign:int *)))) (lambda (options) (let* ((options (map string->utf8 (cons "mecab" options))) (bv (make-bytevector (* (sizeof '*) (+ (length options) 1)) 0))) (cond ((= (sizeof '*) 4) (let loop ((i 0) (options options)) (match options (() #t) ((option options ...) (bytevector-u32-set! bv (* 4 i) (pointer-address (bytevector->pointer option)) (native-endianness)) (loop (+ i 1) options))))) ((= (sizeof '*) 8) (let loop ((i 0) (options options)) (match options (() #t) ((option options ...) (bytevector-u64-set! bv (* 8 i) (pointer-address (bytevector->pointer option)) (native-endianness)) (loop (+ i 1) options))))) (else (throw 'unsupported-pointer-size (sizeof '*)))) (proc (length options) (bytevector->pointer bv)))))) (define mecab-error (let ((proc (mecab->procedure '* "mecab_strerror" '(*)))) (lambda (tagger) (pointer->string (proc tagger))))) (define %mecab-path (bs:pointer (delay (bs:struct `((rnode ,%mecab-node) (rnext ,%mecab-path) (lnode ,%mecab-node) (lnext ,%mecab-path) (cost ,int) (prob ,float)))))) (define %mecab-node (bs:pointer (delay (bs:struct `((prev ,%mecab-node) (next ,%mecab-node) (enext ,%mecab-node) (bnext ,%mecab-node) (rpath ,%mecab-path) (lpath ,%mecab-path) (surface ,(bs:pointer uint8)); char *, not 0-terminated, use length (feature ,cstring-pointer) (id ,unsigned-int) (length ,unsigned-short) (rlength ,unsigned-short) (rcattr ,unsigned-short) (lcattr ,unsigned-short) (posid ,unsigned-short) (chartype ,uint8) (stat ,uint8) (isbest ,uint8) (alpha ,float) (beta ,float) (prob ,float) (wcost ,short) (cost ,long)))))) (define mecab-parse (let ((proc (mecab->procedure '* "mecab_sparse_tonode" '(* *)))) (lambda (tagger str) (bytestructure-ref (bytestructure %mecab-node (pointer-address (proc tagger (string->pointer str)))) '*)))) (define mecab-parse-to-str (let ((proc (mecab->procedure '* "mecab_sparse_tostr" '(* *)))) (lambda (tagger str) (pointer->string (proc tagger (string->pointer str)))))) (define mecab-destroy (let ((proc (mecab->procedure foreign:void "mecab_destroy" '(*)))) (lambda (tagger) (proc tagger)))) (define MECAB_NOR_NODE 0) (define MECAB_UNK_NODE 1) (define MECAB_BOS_NODE 2) (define MECAB_EOS_NODE 3) (define MECAB_EON_NODE 4) (define (node-feature node) (bytestructure-ref node 'feature)) (define (node-surface node) (let* ((surface (bytestructure-ref node 'surface)) (len (bytestructure-ref node 'length)) (pointer (make-pointer surface)) (bv (pointer->bytevector pointer len)) (str (make-bytevector (+ (bytevector-length bv) 1)))) (bytevector-copy! bv 0 str 0 (bytevector-length bv)) (bytevector-u8-set! str (bytevector-length bv) 0) (pointer->string (bytevector->pointer str)))) (define (node-next node) (bytestructure-ref node 'next '*)) (define (node-prev node) (bytestructure-ref node 'prev '*)) (define (node-stat node) (bytestructure-ref node 'stat)) (define (mecab-split tagger str) (let ((bos-node (mecab-parse tagger str))) (let loop ((node (node-next bos-node)) (surfaces '())) (if (= (node-stat node) MECAB_EOS_NODE) (reverse surfaces) (loop (node-next node) (cons (node-surface node) surfaces)))))) (define (mecab-features tagger str) (let ((bos-node (mecab-parse tagger str))) (let loop ((node (node-next bos-node)) (surfaces '())) (if (= (node-stat node) MECAB_EOS_NODE) (reverse surfaces) (loop (node-next node) (cons (node-feature node) surfaces)))))) (define (mecab-words tagger str) (let ((bos-node (mecab-parse tagger str))) (let loop ((node (node-next bos-node)) (words '())) (cond ((= (node-stat node) MECAB_EOS_NODE) words) ((= (node-stat node) MECAB_NOR_NODE) ;; 動詞,自立,*,*,五段・ラ行,連用タ接続,当たる,アタッ,アタッ (match (string-split (node-feature node) #\,) ((gram1 gram2 gram3 gram4 gram5 gram6 dictionary read1 read2) (match gram1 ("助動詞" (loop (node-next node) words)) ("記号" (loop (node-next node) words)) (_ (loop (node-next node) (cons dictionary words))))))) (else (loop (node-next node) words))))))