guile-jsonld/jsonld/download.scm

download.scm

1
;;;; Copyright (C) 2019, 2020 Julien Lepiller <julien@lepiller.eu>
2
;;;; 
3
;;;; This library is free software; you can redistribute it and/or
4
;;;; modify it under the terms of the GNU Lesser General Public
5
;;;; License as published by the Free Software Foundation; either
6
;;;; version 3 of the License, or (at your option) any later version.
7
;;;; 
8
;;;; This library is distributed in the hope that it will be useful,
9
;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
10
;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11
;;;; Lesser General Public License for more details.
12
;;;; 
13
;;;; You should have received a copy of the GNU Lesser General Public
14
;;;; License along with this library; if not, write to the Free Software
15
;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
;;;; 
17
18
(define-module (jsonld download)
19
  #:use-module (ice-9 match)
20
  #:use-module (json)
21
  #:use-module (jsonld iri)
22
  #:use-module (jsonld json)
23
  #:use-module (web client)
24
  #:use-module (web response)
25
  #:use-module (web uri)
26
  #:use-module (rnrs bytevectors)
27
  #:use-module (ice-9 match)
28
  #:use-module (srfi srfi-1)
29
  #:use-module (srfi srfi-9)
30
  #:use-module (sxml simple)
31
  #:export (json-document
32
            make-json-document
33
            new-json-document
34
            json-document?
35
            json-document-context-url
36
            json-document-document-url
37
            json-document-document
38
            json-document-content-type
39
            json-document-profile
40
41
            download-json))
42
43
(define-record-type json-document
44
  (make-json-document context-url document-url document content-type profile)
45
  json-document?
46
  (context-url  json-document-context-url)
47
  (document-url json-document-document-url)
48
  (document     json-document-document)
49
  (content-type json-document-content-type)
50
  (profile      json-document-profile))
51
52
(define* (new-json-document #:key context-url document-url document content-type
53
                            profile)
54
  (make-json-document context-url document-url document content-type profile))
55
56
(define-record-type link-header
57
  (make-link-header uri params)
58
  link-header?
59
  (uri    link-header-uri)
60
  (params link-header-params))
61
62
(define-record-type content-type-header
63
  (make-content-type-header type params)
64
  content-type-header?
65
  (type   content-type-header-type)
66
  (params content-type-header-params))
67
68
(define (parse-link-header link)
69
  (let* ((header (string-split link #\;))
70
         (uri (car header))
71
         (uri (substring uri 1 (- (string-length uri) 1)))
72
         (params (cdr header))
73
         (params (map
74
                   (lambda (param)
75
                     (match (map
76
                              (lambda (l) (string-trim l #\space))
77
                              (string-split param #\=))
78
                       ((type val)
79
                        (cons type (substring val 1 (- (string-length val) 1))))))
80
                   params)))
81
    (make-link-header uri params)))
82
83
(define (parse-type type)
84
  (let* ((mime-type (string-split type #\;))
85
         (type (car mime-type))
86
         (params (map
87
                   (lambda (param)
88
                     (match (string-split param #\=)
89
                       ((type val ...)
90
                        (cons (string-trim type #\space)
91
                              (string-trim (string-join val "=") #\space)))))
92
                   (cdr mime-type))))
93
    (make-content-type-header type params)))
94
95
(define (html-document-base base document)
96
  (define (find-base document)
97
    (match document
98
      (('base ('@ attr ...))
99
       (let loop ((attr attr))
100
         (if (null? attr)
101
           #f
102
           (match (car attr)
103
             (('href base) base)
104
             (_ (loop (cdr attr)))))))
105
      ('() #f)
106
      ((tag ('@ opt ...) rest ...)
107
       (fold (lambda (element result)
108
               (or result (find-base element)))
109
             #f rest))
110
      ((tag rest ...)
111
       (fold (lambda (element result)
112
               (or result (find-base element)))
113
             #f rest))
114
      (_ #f)))
115
116
  (let ((new-base (find-base document)))
117
    (if new-base
118
        (resolve-iri base new-base)
119
        base)))
120
121
(define (get-script-elements document)
122
  (define (find-scripts document)
123
    (match document
124
      (('script t ...)
125
       (list document))
126
      ('() '())
127
      ((tag ('@ opt ...) rest ...)
128
       (append-map get-script-elements rest))
129
      ((tag rest ...)
130
       (append-map get-script-elements rest))
131
      (_ '())))
132
133
  (define (jsonld-script? element)
134
    (match element
135
      (('script ('@ opt ...) content)
136
       (let* ((types (filter (lambda (opt) (equal? (car opt) 'type)) opt))
137
              (type (if (null? types) #f (cadr (car types)))))
138
         (and type
139
              (equal? (content-type-header-type (parse-type type))
140
                      "application/ld+json"))))
141
      (_ #f)))
142
143
  (filter jsonld-script? (find-scripts document)))
144
145
(define (get-script-content-by-id document id)
146
  (define (has-correct-id? element)
147
    (match element
148
      ((tag ('@ opts ...) content)
149
       (let* ((ids (filter (lambda (o) (equal? (car o) 'id)) opts))
150
              (element-id (if (null? ids) #f (cadr (car ids)))))
151
         (equal? element-id id)))))
152
153
  (let ((id-scripts (filter has-correct-id? (get-script-elements document))))
154
    (when (null? id-scripts)
155
      (throw 'loading-document-failed))
156
    (get-script-content (car id-scripts))))
157
158
(define (get-script-content-by-profile document profile)
159
  (define (has-correct-profile? element)
160
    (match element
161
      ((tag ('@ opts ...) content)
162
       (let* ((types (filter (lambda (o) (equal? (car o) 'type)) opts))
163
              (type (if (null? types) #f (cadr (car types)))))
164
         (and type
165
              (equal?
166
                (assoc-ref
167
                  (content-type-header-params (parse-type type))
168
                  "profile")
169
                profile))))))
170
171
  (let ((profile-scripts (filter has-correct-profile?
172
                                 (get-script-elements document))))
173
    (when (null? profile-scripts)
174
      (throw 'loading-document-failed))
175
    (get-script-content (car profile-scripts))))
176
177
(define (get-first-script-content document)
178
  (let ((scripts (get-script-elements document)))
179
    (when (null? scripts)
180
      (throw 'loading-document-failed))
181
    (get-script-content (car scripts))))
182
183
(define (get-script-content element)
184
  (match element
185
     (('script ('@ opts ...) content)
186
      content)))
187
188
(define (reverse-document document)
189
  "Reverse the order of elements in @var{document}, a Json object, array or
190
scalar.  This recursively reverses the order of any array or key-pair
191
association."
192
  (match document
193
    ((? json-array? document)
194
     (list->array 1 (map reverse-document (array->list document))))
195
    ((? json-object? document)
196
     (fold (lambda (pair result) (cons (cons (car pair) (reverse-document (cdr pair))) result)) '() document))
197
    (_ document)))
198
199
(define (links-with-rel links rel)
200
  (filter
201
    (lambda (link)
202
      (equal?
203
        (assoc-ref (link-header-params link) "rel")
204
        rel))
205
    links))
206
207
(define (html->sxml document)
208
  "Convert document (a string) into an sxml representation that preserves entities"
209
  (xml->sxml document
210
             #:entities
211
             '((lt . "&amp;lt;")
212
               (gt . "&amp;gt;")
213
               ;(amp . "&amp;amp;")
214
               (apos . "&amp;apos;"))
215
             #:default-entity-handler
216
             (lambda (port name)
217
               (string-append "&amp;" (symbol->string name) ";"))))
218
219
(define* (download-json uri #:key (mime-type "application/ld+json")
220
                        (extract-all-scripts? #f)
221
                        (profile #f) (request-profile #f)
222
                        ;; Used for tests
223
                        (http-get http-get))
224
  "Download a JsonLD document from @var{uri}, using the HTTP protocol.  The
225
Accept HTTP header can be modified with @var{mime-type}."
226
  (define complete-mime-type
227
    (string-append
228
      (if request-profile
229
          (string-append mime-type ";profile=" request-profile)
230
          mime-type)
231
      (if (equal? request-profile "http://www.w3.org/ns/json-ld#context")
232
          ""
233
          ", text/html;q=0.8; application/xhtml+xml;q=0.5")))
234
235
  (catch #t
236
    (lambda ()
237
      (call-with-values
238
        (lambda ()
239
          (http-get uri #:headers `((Accept . ,complete-mime-type))))
240
        (lambda (hdr body)
241
          (cond
242
            ((member (response-code hdr) '(301 302 303 307))
243
             (download-json (uri->string (response-location hdr))
244
                            #:mime-type mime-type
245
                            #:extract-all-scripts? extract-all-scripts?
246
                            #:profile profile
247
                            #:request-profile request-profile))
248
            ((equal? (response-code hdr) 200)
249
             (let* ((content-type (response-content-type hdr))
250
                    (type (symbol->string (car content-type)))
251
                    (link-headers
252
                      (map cdr (filter
253
                                 (lambda (p) (equal? (car p) 'link))
254
                                 (response-headers hdr))))
255
                    (links (map parse-link-header link-headers))
256
                    (alternates (links-with-rel links "alternate"))
257
                    (alternates (filter
258
                                  (lambda (alternate)
259
                                    (equal?
260
                                      (assoc-ref (link-header-params alternate) "type")
261
                                      "application/ld+json"))
262
                                  alternates))
263
                    (contexts (links-with-rel links "http://www.w3.org/ns/json-ld#context"))
264
                    (context-url #f)
265
                    (document #f)
266
                    (base (if (string? uri) uri (uri->string uri))))
267
               (if (and (not (equal? type "application/json"))
268
                        (or
269
                          (< (string-length type) 5)
270
                          (not (equal?
271
                               (substring type
272
                                          (- (string-length type) 5))
273
                               "+json")))
274
                        (not (null? alternates)))
275
                   (download-json (resolve-iri base (link-header-uri (car alternates)))
276
                                  #:mime-type mime-type
277
                                  #:extract-all-scripts? extract-all-scripts?
278
                                  #:profile profile
279
                                  #:request-profile request-profile)
280
                   (begin
281
                     (when (and (or (equal? type "application/json")
282
                                    (and (> (string-length type) 5)
283
                                         (equal? (substring
284
                                                   type (- (string-length type) 5))
285
                                                 "+json")))
286
                                (not (equal? type "application/ld+json"))
287
                                (not (null? contexts)))
288
                       (set! context-url
289
                         (resolve-iri base (link-header-uri (car contexts))))
290
                       (when (> (length contexts) 1)
291
                         (throw 'multiple-context-link-headers)))
292
                     (cond
293
                       ((or (equal? type "application/json")
294
                            (and (> (string-length type) 5)
295
                                 (equal? (substring type
296
                                                    (- (string-length type) 5))
297
                                         "+json")))
298
                        (set! document (reverse-document
299
                                         (json-string->scm
300
                                           (if (string? body)
301
                                               body
302
                                               (utf8->string body))))))
303
                       ((or (equal? type "text/html")
304
                            (equal? type "application/xhtml+xml"))
305
                         (let* ((content (html->sxml
306
                                           (if (string? body)
307
                                               body
308
                                               (utf8->string body))))
309
                                (uri (string->uri base))
310
                                (source #f))
311
                           (set! base (html-document-base base content))
312
                           (when (uri-fragment uri)
313
                             (set! source (get-script-content-by-id
314
                                            content (uri-fragment uri))))
315
                           (when (and (equal? source #f) profile)
316
                             (set! source (get-script-content-by-profile
317
                                            content profile)))
318
                           (when (and (equal? source #f) (not extract-all-scripts?))
319
                             (set! source (get-first-script-content content)))
320
                           (if source
321
                               (catch #t
322
                                 (lambda ()
323
                                   (set! document (reverse-document
324
                                                    (json-string->scm source))))
325
                                 (lambda _
326
                                   (throw 'invalid-script-element source)))
327
                               (begin
328
                                 (unless extract-all-scripts?
329
                                   (throw 'loading-document-failed))
330
                                 (set! document '())
331
                                 (for-each
332
                                   (lambda (el)
333
                                     (catch #t
334
                                       (lambda ()
335
                                         (let ((script-content
336
                                                 (reverse-document
337
                                                   (json-string->scm
338
                                                     (get-script-content el)))))
339
                                           (set! document
340
                                             (append document
341
                                                     (if (json-array? script-content)
342
                                                         (array->list script-content)
343
                                                         (list script-content))))))
344
                                       (lambda _
345
                                         (throw 'invalid-script-element el))))
346
                                   (get-script-elements content))
347
                                 (set! document (list->array 1 document))))))
348
                       (else
349
                         (throw 'loading-remote-document-failed)))
350
                     (new-json-document
351
                       #:document-url base
352
                       #:document document
353
                       #:content-type type
354
                       #:context-url context-url)))))
355
           (else (throw 'not-found))))))
356
    (lambda (key . value)
357
      (cond
358
        ((equal? key 'multiple-context-link-headers)
359
         (apply throw 'multiple-context-link-headers value))
360
        ((equal? key 'invalid-script-element)
361
         (apply throw 'invalid-script-element value))
362
        (else (apply throw 'loading-remote-document-failed key value))))))
363