Add radk generation

Julien LepillerSat Apr 25 17:11:59+0200 2020

8aedee5

Add radk generation

Makefile

1414
1515
DICO_MODULES=modules/nani/trie.scm modules/nani/result.scm modules/nani/jmdict/trie.scm \
1616
        modules/nani/jmdict/serialize.scm modules/nani/jmdict/xml.scm \
17-
        modules/nani/jmdict/entities.scm modules/nani/frequency.scm
17+
        modules/nani/jmdict/entities.scm modules/nani/frequency.scm \
18+
        modules/nani/serialize.scm
1819
1920
LANGS=fr
2021

7071
7172
po/nani.pot: $(HAUNT_FILES)
7273
	xgettext --keyword=_ --language=scheme --add-comments --sort-output --from-code UTF-8 -o $@ $^
74+
75+
include radicals.mk

modules/nani/jmdict/serialize.scm

1919
(define-module (nani jmdict serialize)
2020
  #:use-module (nani huffman)
2121
  #:use-module (nani result)
22+
  #:use-module (nani serialize)
2223
  #:use-module (nani trie)
2324
  #:use-module (rnrs bytevectors)
2425
  #:export (serialize-jmdict))
2526
26-
(define (merge-bvs bvs)
27-
  (let* ((size (apply + (map bytevector-length bvs)))
28-
         (bv (make-bytevector size 0)))
29-
    (let loop ((bvs bvs) (pos 0))
30-
      (unless (null? bvs)
31-
        (let ((sz (bytevector-length (car bvs))))
32-
          (bytevector-copy! (car bvs) 0 bv pos sz)
33-
          (loop (cdr bvs) (+ pos sz)))))
34-
    bv))
35-
3627
(define (serialize-trie trie results pos)
3728
  (define (serialize-trie-aux transitions pos)
3829
    (let loop ((pos pos)

6354
                      (+ vals-sz 1) bv #:size? #f)
6455
      (cons next-pos (cons bv bvs)))))
6556
66-
(define* (serialize-list lst serialize pos bv #:key (size? #t))
67-
  (when (not (list? lst)) (throw 'not-list lst))
68-
  (when size?
69-
    (bytevector-u16-set! bv pos (length lst) (endianness big)))
70-
  (let loop ((lst lst) (pos (+ pos (if size? 2 0))))
71-
    (if (null? lst)
72-
      pos
73-
      (loop (cdr lst) (serialize (car lst) pos bv)))))
74-
(define* (list-size lst size #:key (size? #t))
75-
  (when (not (list? lst)) (throw 'not-list lst))
76-
  (apply + (if size? 2 0) (map size lst)))
77-
78-
(define (serialize-pointer ptr pos bv)
79-
  (bytevector-u8-set! bv pos (car ptr))
80-
  (bytevector-u32-set! bv (+ pos 1) (trie-position (cdr ptr)) (endianness big))
81-
  (+ pos 5))
82-
83-
(define (serialize-char int pos bv)
84-
  (bytevector-u8-set! bv pos int)
85-
  (+ pos 1))
86-
(define char-size (const 1))
87-
88-
(define (serialize-int int pos bv)
89-
  (bytevector-u32-set! bv pos int (endianness big))
90-
  (+ pos 4))
91-
(define int-size (const 4))
92-
93-
(define (serialize-boolean bool pos bv)
94-
  (bytevector-u8-set! bv pos (if bool 1 0))
95-
  (+ pos 1))
96-
(define boolean-size (const 1))
97-
9857
(define (serialize-jmdict results kanji-trie reading-trie sense-trie)
9958
  (define kanji-huffman
10059
    (let ((kanjis (apply append (map result-kanjis results))))

11170
      (create-huffman glosses)))
11271
  (define sense-huffman-code (huffman->code sense-huffman))
11372
114-
  (define (serialize-string str pos bv)
115-
    (let ((sbv (string->utf8 str)))
116-
      (bytevector-copy! sbv 0 bv pos (bytevector-length sbv))
117-
      (bytevector-u8-set! bv (+ pos (bytevector-length sbv)) 0)
118-
      (+ pos 1 (bytevector-length sbv))))
119-
  (define (string-size str)
120-
    (let ((sbv (string->utf8 str)))
121-
      (+ 1 (bytevector-length sbv))))
122-
12373
  (define (serialize-huffman-string huffman-code)
12474
    (lambda (str pos bv)
12575
      (let ((sbv (huffman-encode huffman-code str)))

modules/nani/radk.scm unknown status 1

1+
;;; Nani Project website
2+
;;; Copyright ?? 2020 Julien Lepiller <julien@lepiller.eu>
3+
;;;
4+
;;; This file is part of the Nani Project website.
5+
;;;
6+
;;; The Nani Project website is free software; you can redistribute it and/or modify it
7+
;;; under the terms of the GNU Affero General Public License as published by
8+
;;; the Free Software Foundation; either version 3 of the License, or (at
9+
;;; your option) any later version.
10+
;;;
11+
;;; The Nani Project website is distributed in the hope that it will be useful, but
12+
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
13+
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14+
;;; GNU Affero General Public License for more details.
15+
;;;
16+
;;; You should have received a copy of the GNU Affero General Public License
17+
;;; along with the Nani Project website.  If not, see <http://www.gnu.org/licenses/>.
18+
19+
(define-module (nani radk)
20+
  #:use-module (ice-9 match)
21+
  #:use-module (ice-9 peg)
22+
  #:use-module (ice-9 rdelim)
23+
  #:use-module (nani serialize)
24+
  #:use-module (rnrs bytevectors)
25+
  #:use-module (sxml simple)
26+
  #:export (parse-radk
27+
            get-kanji-stroke
28+
            get-rad-kanji
29+
            get-rad-stroke
30+
            serialize-radk))
31+
32+
(define-peg-pattern comment none (and "#" (* (or "\t" (range #\x20 #\x10ffff))) "\n"))
33+
(define-peg-pattern space none " ")
34+
(define-peg-pattern return none "\n")
35+
(define-peg-pattern entry all
36+
  (and (ignore "$") space char space num (? (and space name)) (ignore "\n")
37+
       (+ (or char (ignore "\n")))))
38+
(define-peg-pattern num all (+ (or (range #\0 #\9))))
39+
(define-peg-pattern name none (+ (or (range #\0 #\9) (range #\a #\z) (range #\A #\Z))))
40+
(define-peg-pattern char all (and (range #\xff #\x10ffff)))
41+
(define-peg-pattern radk-doc body (* (or return comment entry)))
42+
43+
(define (parse-radk file)
44+
  (peg:tree (match-pattern radk-doc (call-with-input-file file read-string))))
45+
46+
(define (get-rad-kanji content)
47+
  (let loop ((result '()) (content content))
48+
    (match content
49+
      (() result)
50+
      ((('entry ('char radical) ('num stroke) (('char kanji) ...)) content ...)
51+
       (loop (cons (cons radical kanji) result)
52+
             content)))))
53+
54+
(define (get-rad-stroke content)
55+
  (let loop ((result '()) (content content))
56+
    (match content
57+
      (() result)
58+
      ((('entry ('char radical) ('num stroke) (('char kanji) ...)) content ...)
59+
       (loop (cons (cons radical (string->number stroke)) result)
60+
             content)))))
61+
62+
(define (get-kanji-stroke file)
63+
  (define strokes (xml->sxml (call-with-input-file file read-string)))
64+
65+
  (match strokes
66+
    (('*TOP* _ ('kanjidic2 content ...))
67+
     (map
68+
       (lambda (entry)
69+
         (let* ((literal (car (assoc-ref entry 'literal)))
70+
                (misc (assoc-ref entry 'misc))
71+
                (misc (filter list? misc))
72+
                (stroke (car (assoc-ref misc 'stroke_count))))
73+
           (cons literal (string->number stroke))))
74+
       (filter
75+
         (lambda (entry)
76+
           (and
77+
             (list? entry)
78+
             (equal? (car entry) 'character)))
79+
         content)))))
80+
81+
(define (serialize-radk rad-kanji rad-stroke kanji-stroke)
82+
  (define (serialize-rad-kanji-element element pos bv)
83+
    (match element
84+
      ((radical kanji ...)
85+
       (let* ((pos (serialize-string radical pos bv)))
86+
         (serialize-list kanji serialize-string pos bv)))))
87+
  (define (rad-kanji-element-size element)
88+
    (match element
89+
      ((radical kanji ...)
90+
       (+ (string-size radical) (list-size kanji string-size)))))
91+
92+
  (define (serialize-rad-kanji rad-kanji pos bv)
93+
    (serialize-list rad-kanji serialize-rad-kanji-element pos bv))
94+
  (define (rad-kanji-size rad-kanji)
95+
    (list-size rad-kanji rad-kanji-element-size))
96+
97+
  (define (serialize-rad-stroke-element element pos bv)
98+
    (match element
99+
      ((radical . stroke)
100+
       (let ((pos (serialize-string radical pos bv)))
101+
         (serialize-char stroke pos bv)))))
102+
  (define (rad-stroke-element-size element)
103+
    (match element
104+
      ((radical . stroke)
105+
       (+ (string-size radical) (char-size stroke)))))
106+
107+
  (define (serialize-rad-stroke rad-stroke pos bv)
108+
    (serialize-list rad-stroke serialize-rad-stroke-element pos bv))
109+
  (define (rad-stroke-size rad-stroke)
110+
    (list-size rad-stroke rad-stroke-element-size))
111+
112+
  (define serialize-kanji-stroke serialize-rad-stroke)
113+
  (define kanji-stroke-size rad-stroke-size)
114+
115+
  (let* ((header (string->utf8 "NANI_RADK001"))
116+
         (header-size (bytevector-length header))
117+
         (bv (make-bytevector (+ header-size 12
118+
                                 (rad-kanji-size rad-kanji)
119+
                                 (rad-stroke-size rad-stroke)
120+
                                 (kanji-stroke-size kanji-stroke)))))
121+
    (bytevector-copy! header 0 bv 0 header-size)
122+
    (let* ((pos header-size)
123+
           (pos (serialize-rad-kanji rad-kanji pos bv))
124+
           (pos (serialize-rad-stroke rad-stroke pos bv))
125+
           (pos (serialize-kanji-stroke kanji-stroke pos bv)))
126+
      bv)))
127+
128+
129+
130+
131+

modules/nani/serialize.scm unknown status 1

1+
;;; Nani Project website
2+
;;; Copyright ?? 2020 Julien Lepiller <julien@lepiller.eu>
3+
;;;
4+
;;; This file is part of the Nani Project website.
5+
;;;
6+
;;; The Nani Project website is free software; you can redistribute it and/or modify it
7+
;;; under the terms of the GNU Affero General Public License as published by
8+
;;; the Free Software Foundation; either version 3 of the License, or (at
9+
;;; your option) any later version.
10+
;;;
11+
;;; The Nani Project website is distributed in the hope that it will be useful, but
12+
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
13+
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14+
;;; GNU Affero General Public License for more details.
15+
;;;
16+
;;; You should have received a copy of the GNU Affero General Public License
17+
;;; along with the Nani Project website.  If not, see <http://www.gnu.org/licenses/>.
18+
19+
(define-module (nani serialize)
20+
  #:use-module (rnrs bytevectors)
21+
  #:export (merge-bvs
22+
            serialize-list list-size
23+
            serialize-pointer
24+
            serialize-char char-size
25+
            serialize-int int-size
26+
            serialize-boolean boolean-size
27+
            serialize-string string-size))
28+
29+
(define (merge-bvs bvs)
30+
  (let* ((size (apply + (map bytevector-length bvs)))
31+
         (bv (make-bytevector size 0)))
32+
    (let loop ((bvs bvs) (pos 0))
33+
      (unless (null? bvs)
34+
        (let ((sz (bytevector-length (car bvs))))
35+
          (bytevector-copy! (car bvs) 0 bv pos sz)
36+
          (loop (cdr bvs) (+ pos sz)))))
37+
    bv))
38+
39+
(define* (serialize-list lst serialize pos bv #:key (size? #t))
40+
  (when (not (list? lst)) (throw 'not-list lst))
41+
  (when size?
42+
    (bytevector-u16-set! bv pos (length lst) (endianness big)))
43+
  (let loop ((lst lst) (pos (+ pos (if size? 2 0))))
44+
    (if (null? lst)
45+
      pos
46+
      (loop (cdr lst) (serialize (car lst) pos bv)))))
47+
(define* (list-size lst size #:key (size? #t))
48+
  (when (not (list? lst)) (throw 'not-list lst))
49+
  (apply + (if size? 2 0) (map size lst)))
50+
51+
(define (serialize-pointer ptr pos bv)
52+
  (bytevector-u8-set! bv pos (car ptr))
53+
  (bytevector-u32-set! bv (+ pos 1) (trie-position (cdr ptr)) (endianness big))
54+
  (+ pos 5))
55+
56+
(define (serialize-char int pos bv)
57+
  (bytevector-u8-set! bv pos int)
58+
  (+ pos 1))
59+
(define char-size (const 1))
60+
61+
(define (serialize-int int pos bv)
62+
  (bytevector-u32-set! bv pos int (endianness big))
63+
  (+ pos 4))
64+
(define int-size (const 4))
65+
66+
(define (serialize-boolean bool pos bv)
67+
  (bytevector-u8-set! bv pos (if bool 1 0))
68+
  (+ pos 1))
69+
(define boolean-size (const 1))
70+
71+
(define (serialize-string str pos bv)
72+
  (let ((sbv (string->utf8 str)))
73+
    (bytevector-copy! sbv 0 bv pos (bytevector-length sbv))
74+
    (bytevector-u8-set! bv (+ pos (bytevector-length sbv)) 0)
75+
    (+ pos 1 (bytevector-length sbv))))
76+
(define (string-size str)
77+
  (let ((sbv (string->utf8 str)))
78+
    (+ 1 (bytevector-length sbv))))

radicals.mk unknown status 1

1+
RADK_MODULES=tools/radk.scm modules/nani/radk.scm modules/nani/serialize.scm
2+
3+
dictionaries/kradzip.zip:
4+
	wget ftp://ftp.monash.edu/pub/nihongo/kradzip.zip -O $@
5+
6+
dictionaries/radkfilex: dictionaries/kradzip.zip
7+
	unzip $^ $$(basename $@) -d $$(dirname $@)
8+
	touch $@
9+
10+
dictionaries/radkfilex.utf8: dictionaries/radkfilex
11+
	iconv -f euc-jp -t utf-8 $^ > $@
12+
13+
dictionaries/kanjidic2.xml.gz:
14+
	wget http://www.edrdg.org/kanjidic/kanjidic2.xml.gz -O $@
15+
16+
dictionaries/kanjidic2.xml: dictionaries/kanjidic2.xml.gz
17+
	gunzip $^
18+
19+
dicos/radicals.nani: tools/radk.scm dictionaries/radkfilex.utf8 dictionaries/kanjidic2.xml $(RADK_MODULES)
20+
	guile -L modules tools/radk.scm build dictionaries/radkfilex.utf8 dictionaries/kanjidic2.xml $@

tools/jmdict.scm

6363
        (put-bytevector port
6464
          (serialize-jmdict results kanji-trie reading-trie meaning-trie))))))
6565
66+
(define (print word dict)
67+
  ())
68+
6669
(match (command-line)
6770
  ((_ cmd input lang output)
6871
   (cond

7275
       (compile input (lambda (sense) (equal? (sense-language sense) lang)) output)))
7376
    ((equal? cmd "convert")
7477
     (convert input output))
75-
    (else (format #t "Unknown cmd ~a.~%" cmd)))))
78+
    (else (format #t "Unknown cmd ~a.~%" cmd))))
79+
  ((_ "print" word input)
80+
   (print word input)))

tools/radk.scm unknown status 1

1+
;;; Nani Project website
2+
;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu>
3+
;;;
4+
;;; This file is part of the Nani Project website.
5+
;;;
6+
;;; The Nani Project website is free software; you can redistribute it and/or modify it
7+
;;; under the terms of the GNU Affero General Public License as published by
8+
;;; the Free Software Foundation; either version 3 of the License, or (at
9+
;;; your option) any later version.
10+
;;;
11+
;;; The Nani Project website is distributed in the hope that it will be useful, but
12+
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
13+
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14+
;;; GNU Affero General Public License for more details.
15+
;;;
16+
;;; You should have received a copy of the GNU Affero General Public License
17+
;;; along with the Nani Project website.  If not, see <http://www.gnu.org/licenses/>.
18+
19+
(use-modules (nani radk))
20+
(use-modules (ice-9 match))
21+
(use-modules (ice-9 binary-ports))
22+
23+
(match (command-line)
24+
  ((_ cmd radk-file kanjidic-file output)
25+
   (cond
26+
    ((equal? cmd "build")
27+
     (let* ((radk (parse-radk radk-file))
28+
            (rad-stroke (get-rad-stroke radk))
29+
            (rad-kanji (get-rad-kanji radk))
30+
            (kanji-stroke (get-kanji-stroke kanjidic-file)))
31+
       (call-with-output-file output
32+
         (lambda (port)
33+
           (put-bytevector port
34+
             (serialize-radk rad-kanji rad-stroke kanji-stroke))))))
35+
    (else (format #t "Unknown cmd ~a.~%" cmd)))))