Build full wadoku dictionary

Julien Lepiller, Thu Jun 04 17:05:22 +0200 2020

556e145

Build full wadoku dictionary

Makefile

11
.PRECIOUS: po/%.po dictionaries/%
22
3+
# `all` and `download` are commands, not files.  Declaring them phony keeps a
# stale file of the same name (older revisions ran `touch all`) from ever
# making these goals look up to date.
.PHONY: all download
all: site
4+
35
JMDICT_LANGS=dut fre ger hun rus slv spa swe
46
57
DICOS=dicos/JMdict_e.nani $(addprefix dicos/JMdict_, $(addsuffix .nani, $(JMDICT_LANGS)))
68
9+
DOWNLOADS=dictionaries/JMdict_e.xml dictionaries/JMdict.xml dictionaries/frequency.tsv
10+
11+
include radicals.mk
12+
include wadoku.mk
13+
714
PAGES=blog.scm data.scm documentation.scm e404.scm feeds.scm index.scm mentions.scm
815
916
HAUNT_FILES= haunt.scm $(addprefix pages/, $(PAGES)) \

2128
LANGS=fr
2229
MOFILES=$(addprefix po/, $(addsuffix /LC_MESSAGES/nani.mo, $(LANGS)))
2330
24-
all: $(MOFILES) $(WEB_FILES)
31+
site: $(MOFILES) $(WEB_FILES)
2532
	haunt build
2633
	rm -rf public.bak
2734
	mv public public.bak
2835
	mv site public
29-
	touch all
36+
	touch site
3037
3138
download:
3239
	@rm -f dictionaries/*
33-
	@$(MAKE) dictionaries/JMdict_e.xml dictionaries/JMdict.xml dictionaries/frequency.tsv dictionaries/wadoku.xml dictionaries/radkfilex.utf8
40+
	@$(MAKE) $(DOWNLOADS)
3441
3542
# Download JMdict dictionaries from ERDRG
3643
dictionaries/%.xml:

7683
7784
dicos/list: $(DICOS) tools/list.scm $(MOFILES)
7885
	guile -L modules -L . tools/list.scm $@ $(DICOS)
79-
80-
include radicals.mk
81-
include wadoku.mk

modules/nani/tags.scm

3030
    "telecom" "build" "sci" "school" "art" "hobby"
3131
3232
    "season-spring" "season-summer" "season-winter" "season-autumn"
33-
    
33+
    "season-newyear"
34+
35+
    "jlpt-1" "jlpt-2" "jlpt-3" "jlpt-4" "jlpt-5"
36+
3437
    "frq500" "frq1000" "frq2000" "frq5000" "frq10000" "frq20000"))
3538
3639
(define (get-tag tag)

modules/nani/wadoku/entities.scm

2020
    ("Anat." . "anat")
2121
    ("Angeln" . "sports")
2222
    ("Anime" . "work")
23-
    ("Anthropol." . "antho")
23+
    ("Anthropol." . "anth")
2424
    ("arab. Gesch." . "hist")
2525
    ("Archäol." . "archeo")
2626
    ("Archit." . "archit")

104104
    ("dtsch. Gramm." . "ling")
105105
    ("EDV" . "comp")
106106
    ("Einzel-Kanji" . "ling")
107-
    ("Streckenkilometertafel" . "train")
107+
    ("Eisenb." . "engr")
108108
    ("Eishockey" . "sports")
109109
    ("Eiskunstl." . "sports")
110110
    ("Eiskunstlauf" . "sports")

141141
    ("Frisur" . "fashion")
142142
    ("Funkt." . "physics")
143143
    ("Fußball" . "sports")
144-
145144
    ("Gagaku" . "music")
146-
    ("Garten" . "bota")
147-
    ("Gartenk." . "bota")
145+
    ("Garten" . "bot")
146+
    ("Gartenk." . "bot")
148147
    ("Gateball" . "sports")
149148
    ("Gebietsn." . "place")
150149
    ("Geldw." . "finc")

220219
    ("Kanbun" . "ling")
221220
    ("Kanji" . "ling")
222221
    ("Kanpō" . "med")
223-
    ("Karten" . "games")
222+
    ("Karten" . "game")
224223
    ("Kartenspiel" . "game")
225224
    ("Kartogr." . "place")
226225
    ("kath. Christent." . "Christn")

285284
    ("Mus." . "music")
286285
    ("Muschelk." . "zool")
287286
    ("Musikinstr." . "music")
288-
    ("Mykol." . "bota")
287+
    ("Mykol." . "bot")
289288
    ("Mythol." . "rel")
290289
    ("Nachrichtent." . "politics")
291290
    ("Nähen" . "fashion")

394393
    ("Steuerw." . "econ")
395394
    ("Sticken" . "build")
396395
    ("Straßenbau" . "engr")
396+
    ("Streckenkilometertafel" . "train")
397397
    ("Stricken" . "build")
398398
    ("Strukturalismus" . "philo")
399399
    ("Sumō" . "sumo")

modules/nani/wadoku/xml.scm

5454
        (reg (usage-reg usage)))
5555
    `(,@(if (string-null? content)
5656
            (match type
57-
              ("" '()))
57+
              ("" '())
58+
              ("time" '())
59+
              ("hint" '())
60+
              ("abrev" '(tag "abbr")))
5861
            (match type
5962
              ("dom" `((tag ,content)))
63+
              ("time" `((tag "obs"))); always obsol.
6064
              ("hint" (let ((tag (hint->tag content)))
6165
                        (if tag `(tag ,tag) '())))))
6266
      ,@(match reg
6367
          ("" '())
68+
          ("dial." '())
6469
          ("lit" `((tag "litf")))
6570
          ("kinderspr." `((tag "chn")))
6671
          ("kleinkindspr." `((tag "baby")))
6772
          ("jugendspr." `((tag "young")))
73+
          ("besch.-höfl." `((tag "hum")))
6874
          ("coll" `((tag "col")))
6975
          ("vulg." `((tag "vulg")))
7076
          ("Slang" `((tag "sl")))

167173
      (loop
168174
        (match (car lst)
169175
          (('impli impli) (update-source source #:lang impli))
170-
          (('foreign foreign) (update-source source #:content foreign))
176+
          (('foreign foreign)
177+
           (update-source source
178+
             #:content (if (list? foreign)
179+
                           (append foreign (source-content source))
180+
                           (cons foreign (source-content source)))))
171181
          ((? list? l) (loop source l))
172182
          (_ source))
173183
        (cdr lst)))))

287297
      ((('meta . _) lst ...)
288298
       (loop tags result lst))
289299
      ((('genki . genki) lst ...)
290-
       (loop (cons `(tag ,genki) tags) result lst))
300+
       ;(loop (cons `(tag ,genki) tags) result lst))
301+
       (loop tags result lst))
291302
      ((('jlpt . jlpt) lst ...)
292303
       (loop (cons `(tag ,(string-append "jlpt-" jlpt)) tags) result lst))
293304
      ((('ref . (? string? ref)) lst ...)
294305
       (loop (cons `(ref ,ref) tags) result lst))
306+
      ((('ref (? string? ref)) lst ...)
307+
       (loop (cons `(ref ,ref) tags) result lst))
295308
      ((('jap l) lst ...)
296309
       (sub-loop loop tags result lst l))
297310
      ((('foreign l) lst ...)

301314
      ((((? symbol? s) v) lst ...)
302315
       (throw 'unsupported-symbol s v))
303316
      ((((? symbol? s) . v) lst ...)
304-
       (throw 'unsupported-symbol s v))
317+
       (throw 'unsupported-symbol-pair s v))
305318
      (((? list? l) lst ...)
306319
       (sub-loop loop tags result lst l)))))
307320

radicals.mk

11
RADK_MODULES=tools/radk.scm modules/nani/radk.scm modules/nani/serialize.scm
22
DICOS+=dicos/radicals.nani
3+
DOWNLOADS+=dictionaries/radkfilex.utf8 dictionaries/kanjidic2.xml
34
4-
dictionaries/kradzip.zip:
5-
	wget ftp://ftp.monash.edu/pub/nihongo/kradzip.zip -O $@
5+
dictionaries/radkfilex.utf8:
6+
	wget ftp://ftp.monash.edu/pub/nihongo/kradzip.zip -O dictionaries/kradzip.zip
7+
	unzip dictionaries/kradzip.zip radkfilex -d dictionaries
8+
	iconv -f euc-jp -t utf-8 dictionaries/radkfilex > $@
9+
	rm dictionaries/radkfilex
610
7-
dictionaries/radkfilex: dictionaries/kradzip.zip
8-
	unzip $^ $$(basename $@) -d $$(dirname $@)
9-
	touch $@
10-
11-
dictionaries/radkfilex.utf8: dictionaries/radkfilex
12-
	iconv -f euc-jp -t utf-8 $^ > $@
13-
14-
dictionaries/kanjidic2.xml.gz:
15-
	wget http://www.edrdg.org/kanjidic/kanjidic2.xml.gz -O $@
16-
17-
dictionaries/kanjidic2.xml: dictionaries/kanjidic2.xml.gz
18-
	gunzip $^
11+
dictionaries/kanjidic2.xml:
12+
	wget http://www.edrdg.org/kanjidic/kanjidic2.xml.gz -O $@.gz
13+
	gunzip $@.gz
1914
2015
dicos/radicals.nani: tools/radk.scm dictionaries/radkfilex.utf8 dictionaries/kanjidic2.xml $(RADK_MODULES)
2116
	guile -L modules tools/radk.scm build dictionaries/radkfilex.utf8 dictionaries/kanjidic2.xml $@

tools/wadoku.scm

4141
                       results))
4242
         (results (filter (lambda (result) (not (null? (result-senses result))))
4343
                          results)))
44-
    (pk (car results))
45-
    (let ((readings (apply append (map result-readings results))))
46-
      (pk (apply append (map reading-readings readings))))
4744
    results))
4845
4946
(define (compile input sense-filter output)

wadoku.mk

11
WADOKU_TMP_DIR=dictionaries/wadoku-tmp
22
DICOS+=dicos/wadoku_ger.nani
3+
DOWNLOADS+=dictionaries/wadoku.xml
34
45
dictionaries/wadoku.xml:
56
	mkdir $(WADOKU_TMP_DIR)