Improve kanjidic and jmdict formats
modules/nani/kanji/kanjidic.scm
132 | 132 | ||
133 | 133 | (define (update-trie-pos! trie kanji) | |
134 | 134 | (let* ((vals (trie-vals trie)) | |
135 | - | (vals (map (lambda (i) (kanji-position (array-ref kanji i))) vals))) | |
135 | + | (vals (match vals | |
136 | + | ((pos) (kanji-position (array-ref kanji pos))) | |
137 | + | (() 0)))) | |
136 | 138 | (trie-vals-set! trie vals)) | |
137 | 139 | (for-each | |
138 | 140 | (match-lambda | |
… | |||
173 | 175 | (sense-huffman-size (bytevector-length sense-huffman-bv)) | |
174 | 176 | (reading-huffman-bv (serialize-huffman reading-huffman)) | |
175 | 177 | (reading-huffman-size (bytevector-length reading-huffman-bv)) | |
176 | - | (serialize-trie (serialize-trie serialize-int int-size)) | |
177 | - | (trie-size (trie-size int-size)) | |
178 | + | (serialize-trie (serialize-trie-single serialize-int int-size)) | |
179 | + | (trie-size (trie-size-single int-size)) | |
178 | 180 | (kanji-trie (make-kanji-trie kanji)) | |
179 | 181 | (kanji-trie-size (trie-size kanji-trie)) | |
180 | 182 | (results-size |
modules/nani/pitch/pitch.scm
58 | 58 | pitches) | |
59 | 59 | (compress-trie trie))) | |
60 | 60 | ||
61 | + | (define (uniq lst) | |
62 | + | (let loop ((lst lst) (res '())) | |
63 | + | (match lst | |
64 | + | (() res) | |
65 | + | ((elem lst ...) | |
66 | + | (if (member elem res) | |
67 | + | (loop lst res) | |
68 | + | (loop lst (cons elem res))))))) | |
69 | + | ||
61 | 70 | (define (collapse-vals! trie) | |
62 | 71 | (let ((transitions (trie-transitions trie)) | |
63 | 72 | (vals (map (match-lambda ((? string? s) s) ((? number? i) (number->string i))) | |
64 | - | (trie-vals trie)))) | |
73 | + | (uniq (trie-vals trie))))) | |
65 | 74 | (trie-vals-set! trie (string-join vals ", ")) | |
66 | 75 | (for-each collapse-vals! (map cdr transitions)))) | |
67 | 76 |
modules/nani/result/jibiki.scm
293 | 293 | ((equal? elem-gi 'volume) | |
294 | 294 | results) | |
295 | 295 | ((equal? elem-gi 'article) | |
296 | - | (set! results (cons seed results)) | |
296 | + | (let ((entry (sxml->result seed frq))) | |
297 | + | (set! results (cons entry results))) | |
297 | 298 | #f) | |
298 | 299 | (else | |
299 | 300 | (let* ((seed (reverse seed)) |
modules/nani/result/jmdict.scm
426 | 426 | ((equal? elem-gi 'JMdict) | |
427 | 427 | results) | |
428 | 428 | ((equal? elem-gi 'entry) | |
429 | - | (set! results (cons seed results)) | |
429 | + | (let ((entry (sxml->result seed frq))) | |
430 | + | (set! results (cons entry results))) | |
430 | 431 | #f) | |
431 | 432 | (else | |
432 | 433 | (let* ((seed (reverse seed)) |
modules/nani/result/wadoku.scm
351 | 351 | (sub-loop loop infos result lst l))))) | |
352 | 352 | ||
353 | 353 | (define (sxml->element lst elem frq) | |
354 | - | (let ((elem (match elem | |
355 | - | ((_ . elem) elem) | |
356 | - | (_ elem)))) | |
357 | - | (match elem | |
358 | - | ('orth (let ((kanji (filter string? lst))) | |
359 | - | (if (null? kanji) | |
360 | - | #f | |
361 | - | `(kanji | |
362 | - | ,(string-filter | |
363 | - | (lambda (c) | |
364 | - | ;; Remove characters that interfere with kanji | |
365 | - | ;; based search | |
366 | - | (not (member c '(#\??? #\??? #\??? #\??? #\??? #\??? #\??? | |
367 | - | #\?? #\???)))) | |
368 | - | (car kanji)))))) | |
369 | - | ('count #f) | |
370 | - | ('entry (sxml->result lst frq)) | |
371 | - | ('hira `(reading ,(car lst))) | |
372 | - | ('hatsuon `(hatsuon ,(car lst))) | |
373 | - | ('accent `(pitch ,(car lst))) | |
374 | - | ('reading (sxml->reading lst)) | |
375 | - | ('form lst) | |
376 | - | ('impli `(impli ,(car lst))) | |
377 | - | ('text (let loop ((text "") (lst lst)) | |
378 | - | (match lst | |
379 | - | (() text) | |
380 | - | ((('hasPrecedingSpace . _) lst ...) | |
381 | - | (loop (string-append " " text) lst)) | |
382 | - | ((('hasFollowingSpace . _) lst ...) | |
383 | - | (string-append (loop text lst) " ")) | |
384 | - | (((? string? s) lst ...) | |
385 | - | (loop (string-append text s) lst))))) | |
386 | - | ('famn (sxml->string lst)) | |
387 | - | ('expl (sxml->string lst)) | |
388 | - | ('expli (filter list? lst)) | |
389 | - | ('abbrev (filter list? lst)) | |
390 | - | ('token (filter string? lst)) | |
391 | - | ('tr (sxml->string lst)) | |
392 | - | ('transcr `(transcr ,lst)) | |
393 | - | ('trans | |
394 | - | (append (filter list? lst) | |
395 | - | (map (lambda (s) `(trans ,s)) (filter string? lst)))) | |
396 | - | ('jap `(jap ,lst)) | |
397 | - | ('emph lst) | |
398 | - | ('title (filter string? lst)) | |
399 | - | ('transl (filter string? lst)) | |
400 | - | ('topic (filter string? lst)) | |
401 | - | ('iron (filter string? lst)) | |
402 | - | ('specchar (filter string? lst)) | |
403 | - | ('scientif (filter string? lst)) | |
404 | - | ('wikide #f) | |
405 | - | ('wikija #f) | |
406 | - | ('link #f) | |
407 | - | ('ref (sxml->ref lst)) | |
408 | - | ('sref (sxml->ref lst)) | |
409 | - | ('etym (sxml->source lst)) | |
410 | - | ('literal (sxml->string (list "???" (sxml->string lst) "???"))) | |
411 | - | ('def (sxml->string (list "(" (sxml->string lst) ")"))) | |
412 | - | ('date (sxml->string (list "(" (sxml->string lst) ")"))) | |
413 | - | ('birthdeath (sxml->string (list "(" (sxml->string lst) ")"))) | |
414 | - | ('descr (sxml->string (list "(" (sxml->string lst) ")"))) | |
415 | - | ('bracket (sxml->string (list "[" (sxml->string lst) "]"))) | |
416 | - | ('foreign (if (null? lst) #f `(foreign ,(car lst)))) | |
417 | - | ('seasonword `(info ,(string-append "season: " (assoc-ref lst 'type)))) | |
418 | - | ('usg `(infos . ,(usg->infos lst))) | |
419 | - | ('sense (sxml->meaning lst)) | |
420 | - | ('steinhaus (let ((ref (sxml->string lst))) | |
421 | - | (if (string? ref) | |
422 | - | `(ref ,(sxml->string lst)) | |
423 | - | (throw 'not-steinhaus ref)))) | |
424 | - | ('pos '()); TODO: actually find what info to use | |
425 | - | ('wordcomponent (cons 'wordcomponent lst)) | |
426 | - | ('meishi (cons 'meishi lst)) | |
427 | - | ('setsuzokushi (cons 'setsuzokushi lst)) | |
428 | - | ('daimeishi (cons 'daimeishi lst)) | |
429 | - | ('doushi (cons 'doushi lst)) | |
430 | - | ('kandoushi (cons 'kandoushi lst)) | |
431 | - | ('keiyoudoushi (cons 'keiyoudoushi lst)) | |
432 | - | ('keiyoushi (cons 'keiyoushi lst)) | |
433 | - | ('fukushi (cons 'fukushi lst)) | |
434 | - | ('rengo (cons 'rengo lst)) | |
435 | - | ('suffix (cons 'suffix lst)) | |
436 | - | ('prefix (cons 'prefix lst)) | |
437 | - | ('kanji (cons 'kanji lst)) | |
438 | - | ('rentaishi (cons 'rentaishi lst)) | |
439 | - | ('specialcharacter (cons 'specialcharacter lst)) | |
440 | - | ('joshi (cons 'joshi lst)) | |
441 | - | ('fukujoshi (cons 'fukujoshi lst)) | |
442 | - | ('kakujoshi (cons 'kakujoshi lst)) | |
443 | - | ('kakarijoshi (cons 'kakarijoshi lst)) | |
444 | - | ('shuujoshi (cons 'shuujoshi lst)) | |
445 | - | ('setsuzokujoshi (cons 'setsuzokujoshi lst)) | |
446 | - | ('jokeiyoushi (cons 'jokeiyoushi lst)) | |
447 | - | ('jodoushi (cons 'jodoushi lst)) | |
448 | - | ('ruigos lst) | |
449 | - | ('ruigo (ruigo->ref lst)) | |
450 | - | ('gramGrp (gram->info lst))))) | |
354 | + | (match elem | |
355 | + | ('orth (let ((kanji (filter string? lst))) | |
356 | + | (if (null? kanji) | |
357 | + | #f | |
358 | + | `(kanji | |
359 | + | ,(string-filter | |
360 | + | (lambda (c) | |
361 | + | ;; Remove characters that interfere with kanji | |
362 | + | ;; based search | |
363 | + | (not (member c '(#\??? #\??? #\??? #\??? #\??? #\??? #\??? | |
364 | + | #\?? #\???)))) | |
365 | + | (car kanji)))))) | |
366 | + | ('count #f) | |
367 | + | ('entry (sxml->result lst frq)) | |
368 | + | ('hira `(reading ,(car lst))) | |
369 | + | ('hatsuon `(hatsuon ,(car lst))) | |
370 | + | ('accent `(pitch ,(car lst))) | |
371 | + | ('reading (sxml->reading lst)) | |
372 | + | ('form lst) | |
373 | + | ('impli `(impli ,(car lst))) | |
374 | + | ('text (let loop ((text "") (lst lst)) | |
375 | + | (match lst | |
376 | + | (() text) | |
377 | + | ((('hasPrecedingSpace . _) lst ...) | |
378 | + | (loop (string-append " " text) lst)) | |
379 | + | ((('hasFollowingSpace . _) lst ...) | |
380 | + | (string-append (loop text lst) " ")) | |
381 | + | (((? string? s) lst ...) | |
382 | + | (loop (string-append text s) lst))))) | |
383 | + | ('famn (sxml->string lst)) | |
384 | + | ('expl (sxml->string lst)) | |
385 | + | ('expli (filter list? lst)) | |
386 | + | ('abbrev (filter list? lst)) | |
387 | + | ('token (filter string? lst)) | |
388 | + | ('tr (sxml->string lst)) | |
389 | + | ('transcr `(transcr ,lst)) | |
390 | + | ('trans | |
391 | + | (append (filter list? lst) | |
392 | + | (map (lambda (s) `(trans ,s)) (filter string? lst)))) | |
393 | + | ('jap `(jap ,lst)) | |
394 | + | ('emph lst) | |
395 | + | ('title (filter string? lst)) | |
396 | + | ('transl (filter string? lst)) | |
397 | + | ('topic (filter string? lst)) | |
398 | + | ('iron (filter string? lst)) | |
399 | + | ('specchar (filter string? lst)) | |
400 | + | ('scientif (filter string? lst)) | |
401 | + | ('wikide #f) | |
402 | + | ('wikija #f) | |
403 | + | ('link #f) | |
404 | + | ('ref (sxml->ref lst)) | |
405 | + | ('sref (sxml->ref lst)) | |
406 | + | ('etym (sxml->source lst)) | |
407 | + | ('literal (sxml->string (list "???" (sxml->string lst) "???"))) | |
408 | + | ('def (sxml->string (list "(" (sxml->string lst) ")"))) | |
409 | + | ('date (sxml->string (list "(" (sxml->string lst) ")"))) | |
410 | + | ('birthdeath (sxml->string (list "(" (sxml->string lst) ")"))) | |
411 | + | ('descr (sxml->string (list "(" (sxml->string lst) ")"))) | |
412 | + | ('bracket (sxml->string (list "[" (sxml->string lst) "]"))) | |
413 | + | ('foreign (if (null? lst) #f `(foreign ,(car lst)))) | |
414 | + | ('seasonword `(info ,(string-append "season: " (assoc-ref lst 'type)))) | |
415 | + | ('usg `(infos . ,(usg->infos lst))) | |
416 | + | ('sense (sxml->meaning lst)) | |
417 | + | ('steinhaus (let ((ref (sxml->string lst))) | |
418 | + | (if (string? ref) | |
419 | + | `(ref ,(sxml->string lst)) | |
420 | + | (throw 'not-steinhaus ref)))) | |
421 | + | ('pos '()); TODO: actually find what info to use | |
422 | + | ('wordcomponent (cons 'wordcomponent lst)) | |
423 | + | ('meishi (cons 'meishi lst)) | |
424 | + | ('setsuzokushi (cons 'setsuzokushi lst)) | |
425 | + | ('daimeishi (cons 'daimeishi lst)) | |
426 | + | ('doushi (cons 'doushi lst)) | |
427 | + | ('kandoushi (cons 'kandoushi lst)) | |
428 | + | ('keiyoudoushi (cons 'keiyoudoushi lst)) | |
429 | + | ('keiyoushi (cons 'keiyoushi lst)) | |
430 | + | ('fukushi (cons 'fukushi lst)) | |
431 | + | ('rengo (cons 'rengo lst)) | |
432 | + | ('suffix (cons 'suffix lst)) | |
433 | + | ('prefix (cons 'prefix lst)) | |
434 | + | ('kanji (cons 'kanji lst)) | |
435 | + | ('rentaishi (cons 'rentaishi lst)) | |
436 | + | ('specialcharacter (cons 'specialcharacter lst)) | |
437 | + | ('joshi (cons 'joshi lst)) | |
438 | + | ('fukujoshi (cons 'fukujoshi lst)) | |
439 | + | ('kakujoshi (cons 'kakujoshi lst)) | |
440 | + | ('kakarijoshi (cons 'kakarijoshi lst)) | |
441 | + | ('shuujoshi (cons 'shuujoshi lst)) | |
442 | + | ('setsuzokujoshi (cons 'setsuzokujoshi lst)) | |
443 | + | ('jokeiyoushi (cons 'jokeiyoushi lst)) | |
444 | + | ('jodoushi (cons 'jodoushi lst)) | |
445 | + | ('ruigos lst) | |
446 | + | ('ruigo (ruigo->ref lst)) | |
447 | + | ('gramGrp (gram->info lst)))) | |
451 | 448 | ||
452 | 449 | (define (create-parser frq) | |
453 | 450 | (define results '()) | |
… | |||
458 | 455 | ||
459 | 456 | FINISH-ELEMENT | |
460 | 457 | (lambda (elem-gi attributes namespaces parent-seed seed) | |
461 | - | (cond | |
462 | - | ((equal? elem-gi 'entries) | |
463 | - | results) | |
464 | - | ((equal? elem-gi 'entry) | |
465 | - | (set! results (cons seed results)) | |
466 | - | #f) | |
467 | - | (else | |
468 | - | (let* ((seed (reverse seed)) | |
469 | - | (element (sxml->element seed elem-gi frq))) | |
470 | - | (cons element parent-seed))))) | |
458 | + | (let ((elem-gi (match elem-gi | |
459 | + | ((_ . elem) elem) | |
460 | + | (_ elem-gi)))) | |
461 | + | (cond | |
462 | + | ((equal? elem-gi 'entries) | |
463 | + | results) | |
464 | + | ((equal? elem-gi 'entry) | |
465 | + | (let ((entry (sxml->result seed frq))) | |
466 | + | (set! results (cons entry results))) | |
467 | + | #f) | |
468 | + | (else | |
469 | + | (let* ((seed (reverse seed)) | |
470 | + | (element (sxml->element seed elem-gi frq))) | |
471 | + | (cons element parent-seed)))))) | |
471 | 472 | ||
472 | 473 | CHAR-DATA-HANDLER | |
473 | 474 | (lambda (string1 string2 seed) |