Build full wadoku dictionary
Makefile
1 | 1 | .PRECIOUS: po/%.po dictionaries/% | |
2 | 2 | ||
3 | + | all: site | |
4 | + | ||
3 | 5 | JMDICT_LANGS=dut fre ger hun rus slv spa swe | |
4 | 6 | ||
5 | 7 | DICOS=dicos/JMdict_e.nani $(addprefix dicos/JMdict_, $(addsuffix .nani, $(JMDICT_LANGS))) | |
6 | 8 | ||
9 | + | DOWNLOADS=dictionaries/JMdict_e.xml dictionaries/JMdict.xml dictionaries/frequency.tsv | |
10 | + | ||
11 | + | include radicals.mk | |
12 | + | include wadoku.mk | |
13 | + | ||
7 | 14 | PAGES=blog.scm data.scm documentation.scm e404.scm feeds.scm index.scm mentions.scm | |
8 | 15 | ||
9 | 16 | HAUNT_FILES= haunt.scm $(addprefix pages/, $(PAGES)) \ | |
… | |||
21 | 28 | LANGS=fr | |
22 | 29 | MOFILES=$(addprefix po/, $(addsuffix /LC_MESSAGES/nani.mo, $(LANGS))) | |
23 | 30 | ||
24 | - | all: $(MOFILES) $(WEB_FILES) | |
31 | + | site: $(MOFILES) $(WEB_FILES) | |
25 | 32 | haunt build | |
26 | 33 | rm -rf public.bak | |
27 | 34 | mv public public.bak | |
28 | 35 | mv site public | |
29 | - | touch all | |
36 | + | touch site | |
30 | 37 | ||
31 | 38 | download: | |
32 | 39 | @rm -f dictionaries/* | |
33 | - | @$(MAKE) dictionaries/JMdict_e.xml dictionaries/JMdict.xml dictionaries/frequency.tsv dictionaries/wadoku.xml dictionaries/radkfilex.utf8 | |
40 | + | @$(MAKE) $(DOWNLOADS) | |
34 | 41 | ||
35 | 42 | # Download JMdict dictionaries from ERDRG | |
36 | 43 | dictionaries/%.xml: | |
… | |||
76 | 83 | ||
77 | 84 | dicos/list: $(DICOS) tools/list.scm $(MOFILES) | |
78 | 85 | guile -L modules -L . tools/list.scm $@ $(DICOS) | |
79 | - | ||
80 | - | include radicals.mk | |
81 | - | include wadoku.mk |
modules/nani/tags.scm
30 | 30 | "telecom" "build" "sci" "school" "art" "hobby" | |
31 | 31 | ||
32 | 32 | "season-spring" "season-summer" "season-winter" "season-autumn" | |
33 | - | ||
33 | + | "season-newyear" | |
34 | + | ||
35 | + | "jlpt-1" "jlpt-2" "jlpt-3" "jlpt-4" "jlpt-5" | |
36 | + | ||
34 | 37 | "frq500" "frq1000" "frq2000" "frq5000" "frq10000" "frq20000")) | |
35 | 38 | ||
36 | 39 | (define (get-tag tag) |
modules/nani/wadoku/entities.scm
20 | 20 | ("Anat." . "anat") | |
21 | 21 | ("Angeln" . "sports") | |
22 | 22 | ("Anime" . "work") | |
23 | - | ("Anthropol." . "antho") | |
23 | + | ("Anthropol." . "anth") | |
24 | 24 | ("arab. Gesch." . "hist") | |
25 | 25 | ("Archäol." . "archeo") | |
26 | 26 | ("Archit." . "archit") | |
… | |||
104 | 104 | ("dtsch. Gramm." . "ling") | |
105 | 105 | ("EDV" . "comp") | |
106 | 106 | ("Einzel-Kanji" . "ling") | |
107 | - | ("Streckenkilometertafel" . "train") | |
107 | + | ("Eisenb." . "engr") | |
108 | 108 | ("Eishockey" . "sports") | |
109 | 109 | ("Eiskunstl." . "sports") | |
110 | 110 | ("Eiskunstlauf" . "sports") | |
… | |||
141 | 141 | ("Frisur" . "fashion") | |
142 | 142 | ("Funkt." . "physics") | |
143 | 143 | ("Fußball" . "sports") | |
144 | - | ||
145 | 144 | ("Gagaku" . "music") | |
146 | - | ("Garten" . "bota") | |
147 | - | ("Gartenk." . "bota") | |
145 | + | ("Garten" . "bot") | |
146 | + | ("Gartenk." . "bot") | |
148 | 147 | ("Gateball" . "sports") | |
149 | 148 | ("Gebietsn." . "place") | |
150 | 149 | ("Geldw." . "finc") | |
… | |||
220 | 219 | ("Kanbun" . "ling") | |
221 | 220 | ("Kanji" . "ling") | |
222 | 221 | ("Kanpō" . "med") | |
223 | - | ("Karten" . "games") | |
222 | + | ("Karten" . "game") | |
224 | 223 | ("Kartenspiel" . "game") | |
225 | 224 | ("Kartogr." . "place") | |
226 | 225 | ("kath. Christent." . "Christn") | |
… | |||
285 | 284 | ("Mus." . "music") | |
286 | 285 | ("Muschelk." . "zool") | |
287 | 286 | ("Musikinstr." . "music") | |
288 | - | ("Mykol." . "bota") | |
287 | + | ("Mykol." . "bot") | |
289 | 288 | ("Mythol." . "rel") | |
290 | 289 | ("Nachrichtent." . "politics") | |
291 | 290 | ("Nähen" . "fashion") | |
… | |||
394 | 393 | ("Steuerw." . "econ") | |
395 | 394 | ("Sticken" . "build") | |
396 | 395 | ("Straßenbau" . "engr") | |
396 | + | ("Streckenkilometertafel" . "train") | |
397 | 397 | ("Stricken" . "build") | |
398 | 398 | ("Strukturalismus" . "philo") | |
399 | 399 | ("Sumō" . "sumo")
modules/nani/wadoku/xml.scm
54 | 54 | (reg (usage-reg usage))) | |
55 | 55 | `(,@(if (string-null? content) | |
56 | 56 | (match type | |
57 | - | ("" '())) | |
57 | + | ("" '()) | |
58 | + | ("time" '()) | |
59 | + | ("hint" '()) | |
60 | + | ("abrev" '(tag "abbr"))) | |
58 | 61 | (match type | |
59 | 62 | ("dom" `((tag ,content))) | |
63 | + | ("time" `((tag "obs"))); always obsol. | |
60 | 64 | ("hint" (let ((tag (hint->tag content))) | |
61 | 65 | (if tag `(tag ,tag) '()))))) | |
62 | 66 | ,@(match reg | |
63 | 67 | ("" '()) | |
68 | + | ("dial." '()) | |
64 | 69 | ("lit" `((tag "litf"))) | |
65 | 70 | ("kinderspr." `((tag "chn"))) | |
66 | 71 | ("kleinkindspr." `((tag "baby"))) | |
67 | 72 | ("jugendspr." `((tag "young"))) | |
73 | + | ("besch.-höfl." `((tag "hum"))) | |
68 | 74 | ("coll" `((tag "col"))) | |
69 | 75 | ("vulg." `((tag "vulg"))) | |
70 | 76 | ("Slang" `((tag "sl"))) | |
… | |||
167 | 173 | (loop | |
168 | 174 | (match (car lst) | |
169 | 175 | (('impli impli) (update-source source #:lang impli)) | |
170 | - | (('foreign foreign) (update-source source #:content foreign)) | |
176 | + | (('foreign foreign) | |
177 | + | (update-source source | |
178 | + | #:content (if (list? foreign) | |
179 | + | (append foreign (source-content source)) | |
180 | + | (cons foreign (source-content source))))) | |
171 | 181 | ((? list? l) (loop source l)) | |
172 | 182 | (_ source)) | |
173 | 183 | (cdr lst))))) | |
… | |||
287 | 297 | ((('meta . _) lst ...) | |
288 | 298 | (loop tags result lst)) | |
289 | 299 | ((('genki . genki) lst ...) | |
290 | - | (loop (cons `(tag ,genki) tags) result lst)) | |
300 | + | ;(loop (cons `(tag ,genki) tags) result lst)) | |
301 | + | (loop tags result lst)) | |
291 | 302 | ((('jlpt . jlpt) lst ...) | |
292 | 303 | (loop (cons `(tag ,(string-append "jlpt-" jlpt)) tags) result lst)) | |
293 | 304 | ((('ref . (? string? ref)) lst ...) | |
294 | 305 | (loop (cons `(ref ,ref) tags) result lst)) | |
306 | + | ((('ref (? string? ref)) lst ...) | |
307 | + | (loop (cons `(ref ,ref) tags) result lst)) | |
295 | 308 | ((('jap l) lst ...) | |
296 | 309 | (sub-loop loop tags result lst l)) | |
297 | 310 | ((('foreign l) lst ...) | |
… | |||
301 | 314 | ((((? symbol? s) v) lst ...) | |
302 | 315 | (throw 'unsupported-symbol s v)) | |
303 | 316 | ((((? symbol? s) . v) lst ...) | |
304 | - | (throw 'unsupported-symbol s v)) | |
317 | + | (throw 'unsupported-symbol-pair s v)) | |
305 | 318 | (((? list? l) lst ...) | |
306 | 319 | (sub-loop loop tags result lst l))))) | |
307 | 320 |
radicals.mk
1 | 1 | RADK_MODULES=tools/radk.scm modules/nani/radk.scm modules/nani/serialize.scm | |
2 | 2 | DICOS+=dicos/radicals.nani | |
3 | + | DOWNLOADS+=dictionaries/radkfilex.utf8 dictionaries/kanjidic2.xml | |
3 | 4 | ||
4 | - | dictionaries/kradzip.zip: | |
5 | - | wget ftp://ftp.monash.edu/pub/nihongo/kradzip.zip -O $@ | |
5 | + | dictionaries/radkfilex.utf8: | |
6 | + | wget ftp://ftp.monash.edu/pub/nihongo/kradzip.zip -O dictionaries/kradzip.zip | |
7 | + | unzip dictionaries/kradzip.zip radkfilex -d dictionaries | |
8 | + | iconv -f euc-jp -t utf-8 dictionaries/radkfilex > $@ | |
9 | + | rm dictionaries/radkfilex # unzip -d put the EUC-JP file under dictionaries/ | |
6 | 10 | ||
7 | - | dictionaries/radkfilex: dictionaries/kradzip.zip | |
8 | - | unzip $^ $$(basename $@) -d $$(dirname $@) | |
9 | - | touch $@ | |
10 | - | ||
11 | - | dictionaries/radkfilex.utf8: dictionaries/radkfilex | |
12 | - | iconv -f euc-jp -t utf-8 $^ > $@ | |
13 | - | ||
14 | - | dictionaries/kanjidic2.xml.gz: | |
15 | - | wget http://www.edrdg.org/kanjidic/kanjidic2.xml.gz -O $@ | |
16 | - | ||
17 | - | dictionaries/kanjidic2.xml: dictionaries/kanjidic2.xml.gz | |
18 | - | gunzip $^ | |
11 | + | dictionaries/kanjidic2.xml: | |
12 | + | wget http://www.edrdg.org/kanjidic/kanjidic2.xml.gz -O $@.gz | |
13 | + | gunzip $@.gz | |
19 | 14 | ||
20 | 15 | dicos/radicals.nani: tools/radk.scm dictionaries/radkfilex.utf8 dictionaries/kanjidic2.xml $(RADK_MODULES) | |
21 | 16 | guile -L modules tools/radk.scm build dictionaries/radkfilex.utf8 dictionaries/kanjidic2.xml $@ |
tools/wadoku.scm
41 | 41 | results)) | |
42 | 42 | (results (filter (lambda (result) (not (null? (result-senses result)))) | |
43 | 43 | results))) | |
44 | - | (pk (car results)) | |
45 | - | (let ((readings (apply append (map result-readings results)))) | |
46 | - | (pk (apply append (map reading-readings readings)))) | |
47 | 44 | results)) | |
48 | 45 | ||
49 | 46 | (define (compile input sense-filter output) |
wadoku.mk
1 | 1 | WADOKU_TMP_DIR=dictionaries/wadoku-tmp | |
2 | 2 | DICOS+=dicos/wadoku_ger.nani | |
3 | + | DOWNLOADS+=dictionaries/wadoku.xml | |
3 | 4 | ||
4 | 5 | dictionaries/wadoku.xml: | |
5 | 6 | mkdir $(WADOKU_TMP_DIR) |