Improve kanjidic and jmdict formats

Julien LepillerTue Aug 03 14:23:35+0200 2021

ea951ee

Improve kanjidic and jmdict formats

modules/nani/kanji/kanjidic.scm

132132
133133
(define (update-trie-pos! trie kanji)
134134
  (let* ((vals (trie-vals trie))
135-
         (vals (map (lambda (i) (kanji-position (array-ref kanji i))) vals)))
135+
         (vals (match vals
136+
                 ((pos) (kanji-position (array-ref kanji pos)))
137+
                 (() 0))))
136138
    (trie-vals-set! trie vals))
137139
  (for-each
138140
    (match-lambda

173175
         (sense-huffman-size (bytevector-length sense-huffman-bv))
174176
         (reading-huffman-bv (serialize-huffman reading-huffman))
175177
         (reading-huffman-size (bytevector-length reading-huffman-bv))
176-
         (serialize-trie (serialize-trie serialize-int int-size))
177-
         (trie-size (trie-size int-size))
178+
         (serialize-trie (serialize-trie-single serialize-int int-size))
179+
         (trie-size (trie-size-single int-size))
178180
         (kanji-trie (make-kanji-trie kanji))
179181
         (kanji-trie-size (trie-size kanji-trie))
180182
         (results-size

modules/nani/pitch/pitch.scm

5858
      pitches)
5959
    (compress-trie trie)))
6060
61+
(define (uniq lst)
62+
  (let loop ((lst lst) (res '()))
63+
    (match lst
64+
      (() res)
65+
      ((elem lst ...)
66+
       (if (member elem res)
67+
           (loop lst res)
68+
           (loop lst (cons elem res)))))))
69+
6170
(define (collapse-vals! trie)
6271
  (let ((transitions (trie-transitions trie))
6372
        (vals (map (match-lambda ((? string? s) s) ((? number? i) (number->string i)))
64-
                   (trie-vals trie))))
73+
                   (uniq (trie-vals trie)))))
6574
    (trie-vals-set! trie (string-join vals ", "))
6675
    (for-each collapse-vals! (map cdr transitions))))
6776

modules/nani/result/jibiki.scm

293293
        ((equal? elem-gi 'volume)
294294
         results)
295295
        ((equal? elem-gi 'article)
296-
         (set! results (cons seed results))
296+
         (let ((entry (sxml->result seed frq)))
297+
           (set! results (cons entry results)))
297298
         #f)
298299
        (else
299300
         (let* ((seed (reverse seed))

modules/nani/result/jmdict.scm

426426
        ((equal? elem-gi 'JMdict)
427427
         results)
428428
        ((equal? elem-gi 'entry)
429-
         (set! results (cons seed results))
429+
         (let ((entry (sxml->result seed frq)))
430+
           (set! results (cons entry results)))
430431
         #f)
431432
        (else
432433
         (let* ((seed (reverse seed))

modules/nani/result/wadoku.scm

351351
       (sub-loop loop infos result lst l)))))
352352
353353
(define (sxml->element lst elem frq)
354-
  (let ((elem (match elem
355-
                ((_ . elem) elem)
356-
                (_ elem))))
357-
    (match elem
358-
      ('orth (let ((kanji (filter string? lst)))
359-
               (if (null? kanji)
360-
                   #f
361-
                   `(kanji
362-
                      ,(string-filter
363-
                         (lambda (c)
364-
                           ;; Remove characters that interfere with kanji
365-
                           ;; based search
366-
                           (not (member c '(#\??? #\??? #\??? #\??? #\??? #\??? #\???
367-
                                            #\?? #\???))))
368-
                         (car kanji))))))
369-
      ('count #f)
370-
      ('entry (sxml->result lst frq))
371-
      ('hira `(reading ,(car lst)))
372-
      ('hatsuon `(hatsuon ,(car lst)))
373-
      ('accent `(pitch ,(car lst)))
374-
      ('reading (sxml->reading lst))
375-
      ('form lst)
376-
      ('impli `(impli ,(car lst)))
377-
      ('text (let loop ((text "") (lst lst))
378-
               (match lst
379-
                 (() text)
380-
                 ((('hasPrecedingSpace . _) lst ...)
381-
                  (loop (string-append " " text) lst))
382-
                 ((('hasFollowingSpace . _) lst ...)
383-
                  (string-append (loop text lst) " "))
384-
                 (((? string? s) lst ...)
385-
                  (loop (string-append text s) lst)))))
386-
      ('famn (sxml->string lst))
387-
      ('expl (sxml->string lst))
388-
      ('expli (filter list? lst))
389-
      ('abbrev (filter list? lst))
390-
      ('token (filter string? lst))
391-
      ('tr (sxml->string lst))
392-
      ('transcr `(transcr ,lst))
393-
      ('trans
394-
        (append (filter list? lst)
395-
                (map (lambda (s) `(trans ,s)) (filter string? lst))))
396-
      ('jap `(jap ,lst))
397-
      ('emph lst)
398-
      ('title (filter string? lst))
399-
      ('transl (filter string? lst))
400-
      ('topic (filter string? lst))
401-
      ('iron (filter string? lst))
402-
      ('specchar (filter string? lst))
403-
      ('scientif (filter string? lst))
404-
      ('wikide #f)
405-
      ('wikija #f)
406-
      ('link #f)
407-
      ('ref (sxml->ref lst))
408-
      ('sref (sxml->ref lst))
409-
      ('etym (sxml->source lst))
410-
      ('literal (sxml->string (list "???" (sxml->string lst) "???")))
411-
      ('def (sxml->string (list "(" (sxml->string lst) ")")))
412-
      ('date (sxml->string (list "(" (sxml->string lst) ")")))
413-
      ('birthdeath (sxml->string (list "(" (sxml->string lst) ")")))
414-
      ('descr (sxml->string (list "(" (sxml->string lst) ")")))
415-
      ('bracket (sxml->string (list "[" (sxml->string lst) "]")))
416-
      ('foreign (if (null? lst) #f `(foreign ,(car lst))))
417-
      ('seasonword `(info ,(string-append "season: " (assoc-ref lst 'type))))
418-
      ('usg `(infos . ,(usg->infos lst)))
419-
      ('sense (sxml->meaning lst))
420-
      ('steinhaus (let ((ref (sxml->string lst)))
421-
                    (if (string? ref)
422-
                        `(ref ,(sxml->string lst))
423-
                        (throw 'not-steinhaus ref))))
424-
      ('pos '()); TODO: actually find what info to use
425-
      ('wordcomponent (cons 'wordcomponent lst))
426-
      ('meishi (cons 'meishi lst))
427-
      ('setsuzokushi (cons 'setsuzokushi lst))
428-
      ('daimeishi (cons 'daimeishi lst))
429-
      ('doushi (cons 'doushi lst))
430-
      ('kandoushi (cons 'kandoushi lst))
431-
      ('keiyoudoushi (cons 'keiyoudoushi lst))
432-
      ('keiyoushi (cons 'keiyoushi lst))
433-
      ('fukushi (cons 'fukushi lst))
434-
      ('rengo (cons 'rengo lst))
435-
      ('suffix (cons 'suffix lst))
436-
      ('prefix (cons 'prefix lst))
437-
      ('kanji (cons 'kanji lst))
438-
      ('rentaishi (cons 'rentaishi lst))
439-
      ('specialcharacter (cons 'specialcharacter lst))
440-
      ('joshi (cons 'joshi lst))
441-
      ('fukujoshi (cons 'fukujoshi lst))
442-
      ('kakujoshi (cons 'kakujoshi lst))
443-
      ('kakarijoshi (cons 'kakarijoshi lst))
444-
      ('shuujoshi (cons 'shuujoshi lst))
445-
      ('setsuzokujoshi (cons 'setsuzokujoshi lst))
446-
      ('jokeiyoushi (cons 'jokeiyoushi lst))
447-
      ('jodoushi (cons 'jodoushi lst))
448-
      ('ruigos lst)
449-
      ('ruigo (ruigo->ref lst))
450-
      ('gramGrp (gram->info lst)))))
354+
  (match elem
355+
    ('orth (let ((kanji (filter string? lst)))
356+
             (if (null? kanji)
357+
                 #f
358+
                 `(kanji
359+
                    ,(string-filter
360+
                       (lambda (c)
361+
                         ;; Remove characters that interfere with kanji
362+
                         ;; based search
363+
                         (not (member c '(#\??? #\??? #\??? #\??? #\??? #\??? #\???
364+
                                          #\?? #\???))))
365+
                       (car kanji))))))
366+
    ('count #f)
367+
    ('entry (sxml->result lst frq))
368+
    ('hira `(reading ,(car lst)))
369+
    ('hatsuon `(hatsuon ,(car lst)))
370+
    ('accent `(pitch ,(car lst)))
371+
    ('reading (sxml->reading lst))
372+
    ('form lst)
373+
    ('impli `(impli ,(car lst)))
374+
    ('text (let loop ((text "") (lst lst))
375+
             (match lst
376+
               (() text)
377+
               ((('hasPrecedingSpace . _) lst ...)
378+
                (loop (string-append " " text) lst))
379+
               ((('hasFollowingSpace . _) lst ...)
380+
                (string-append (loop text lst) " "))
381+
               (((? string? s) lst ...)
382+
                (loop (string-append text s) lst)))))
383+
    ('famn (sxml->string lst))
384+
    ('expl (sxml->string lst))
385+
    ('expli (filter list? lst))
386+
    ('abbrev (filter list? lst))
387+
    ('token (filter string? lst))
388+
    ('tr (sxml->string lst))
389+
    ('transcr `(transcr ,lst))
390+
    ('trans
391+
      (append (filter list? lst)
392+
              (map (lambda (s) `(trans ,s)) (filter string? lst))))
393+
    ('jap `(jap ,lst))
394+
    ('emph lst)
395+
    ('title (filter string? lst))
396+
    ('transl (filter string? lst))
397+
    ('topic (filter string? lst))
398+
    ('iron (filter string? lst))
399+
    ('specchar (filter string? lst))
400+
    ('scientif (filter string? lst))
401+
    ('wikide #f)
402+
    ('wikija #f)
403+
    ('link #f)
404+
    ('ref (sxml->ref lst))
405+
    ('sref (sxml->ref lst))
406+
    ('etym (sxml->source lst))
407+
    ('literal (sxml->string (list "???" (sxml->string lst) "???")))
408+
    ('def (sxml->string (list "(" (sxml->string lst) ")")))
409+
    ('date (sxml->string (list "(" (sxml->string lst) ")")))
410+
    ('birthdeath (sxml->string (list "(" (sxml->string lst) ")")))
411+
    ('descr (sxml->string (list "(" (sxml->string lst) ")")))
412+
    ('bracket (sxml->string (list "[" (sxml->string lst) "]")))
413+
    ('foreign (if (null? lst) #f `(foreign ,(car lst))))
414+
    ('seasonword `(info ,(string-append "season: " (assoc-ref lst 'type))))
415+
    ('usg `(infos . ,(usg->infos lst)))
416+
    ('sense (sxml->meaning lst))
417+
    ('steinhaus (let ((ref (sxml->string lst)))
418+
                  (if (string? ref)
419+
                      `(ref ,(sxml->string lst))
420+
                      (throw 'not-steinhaus ref))))
421+
    ('pos '()); TODO: actually find what info to use
422+
    ('wordcomponent (cons 'wordcomponent lst))
423+
    ('meishi (cons 'meishi lst))
424+
    ('setsuzokushi (cons 'setsuzokushi lst))
425+
    ('daimeishi (cons 'daimeishi lst))
426+
    ('doushi (cons 'doushi lst))
427+
    ('kandoushi (cons 'kandoushi lst))
428+
    ('keiyoudoushi (cons 'keiyoudoushi lst))
429+
    ('keiyoushi (cons 'keiyoushi lst))
430+
    ('fukushi (cons 'fukushi lst))
431+
    ('rengo (cons 'rengo lst))
432+
    ('suffix (cons 'suffix lst))
433+
    ('prefix (cons 'prefix lst))
434+
    ('kanji (cons 'kanji lst))
435+
    ('rentaishi (cons 'rentaishi lst))
436+
    ('specialcharacter (cons 'specialcharacter lst))
437+
    ('joshi (cons 'joshi lst))
438+
    ('fukujoshi (cons 'fukujoshi lst))
439+
    ('kakujoshi (cons 'kakujoshi lst))
440+
    ('kakarijoshi (cons 'kakarijoshi lst))
441+
    ('shuujoshi (cons 'shuujoshi lst))
442+
    ('setsuzokujoshi (cons 'setsuzokujoshi lst))
443+
    ('jokeiyoushi (cons 'jokeiyoushi lst))
444+
    ('jodoushi (cons 'jodoushi lst))
445+
    ('ruigos lst)
446+
    ('ruigo (ruigo->ref lst))
447+
    ('gramGrp (gram->info lst))))
451448
452449
(define (create-parser frq)
453450
  (define results '())

458455
    
459456
    FINISH-ELEMENT
460457
    (lambda (elem-gi attributes namespaces parent-seed seed)
461-
      (cond
462-
        ((equal? elem-gi 'entries)
463-
         results)
464-
        ((equal? elem-gi 'entry)
465-
         (set! results (cons seed results))
466-
         #f)
467-
        (else
468-
         (let* ((seed (reverse seed))
469-
                (element (sxml->element seed elem-gi frq)))
470-
           (cons element parent-seed)))))
458+
      (let ((elem-gi (match elem-gi
459+
                       ((_ . elem) elem)
460+
                       (_ elem-gi))))
461+
        (cond
462+
          ((equal? elem-gi 'entries)
463+
           results)
464+
          ((equal? elem-gi 'entry)
465+
           (let ((entry (sxml->result seed frq)))
466+
             (set! results (cons entry results)))
467+
           #f)
468+
          (else
469+
           (let* ((seed (reverse seed))
470+
                  (element (sxml->element seed elem-gi frq)))
471+
             (cons element parent-seed))))))
471472
    
472473
    CHAR-DATA-HANDLER
473474
    (lambda (string1 string2 seed)