1 ;;; org-tag-eldoc-wikipedia.el --- Web Scraping on Wikipedia -*- lexical-binding: t; -*-
2 ;; -*- coding: utf-8 -*-
4 ;; Copyright (C) 2024-2025 Christopher M. Miles, all rights reserved.
8 ;; search API: https://wikipedia.org/w/index.php?search=[query]
9 ;; tag URL: https://en.wikipedia.org/wiki/Computer_science
14 (require 'url-http
) ; for `url-http-end-of-headers'
17 (require 'elquery nil t
) ; optionally use elquery.el to replace dom.el.
18 (require 'org-tag-eldoc-common
)
(defun org-tag-eldoc-wikipedia--request (tag)
  "Send an HTTP GET request to Wikipedia for the explanation of TAG.

TAG is capitalized, percent-encoded and used as the `title' query
parameter of the English Wikipedia index page.

The response body is parsed with `libxml-parse-html-region' when that
function is bound, otherwise with `elquery-read-buffer' when the
`elquery' feature is loaded.  On success the text of the first
attribute-less <p> element inside the article body is stored in
`org-tag-eldoc--explanation'.

Request errors are reported with `message'.  Responses with status
404 or 500 are handled by no-op callbacks."
  (let ((request-backend 'url-retrieve) ; use `url-retrieve' backend for proxy.
        (url-proxy-services org-tag-eldoc-request-proxy)
        ;; Percent-encode the title so that tags containing characters
        ;; such as `+', `&', spaces or non-ASCII text yield a valid URL.
        (url (format "https://en.wikipedia.org/w/index.php?title=%s"
                     (url-hexify-string (capitalize tag)))))
    (request url
      :type "GET"
      :parser (lambda ()
                (cond
                 ;; convert HTML -> Elisp alist structure
                 ((fboundp 'libxml-parse-html-region)
                  (libxml-parse-html-region (point-min) (point-max)))
                 ;; convert HTML -> elquery object structure
                 ((featurep 'elquery)
                  (elquery-read-buffer (current-buffer)))))
      :success (cl-function
                (lambda (&key data &allow-other-keys)
                  ;; DEBUG: (setq request-result data)
                  ;; The branch order mirrors the :parser above, so DATA
                  ;; is always consumed by the matching extraction code.
                  (cond
                   ;; for `libxml' parser
                   ((fboundp 'libxml-parse-html-region)
                    (setq org-tag-eldoc--explanation
                          (org-tag-eldoc-common--format-explanation
                           (dom-texts
                            ;; Only <p> nodes without attributes are kept.
                            (dom-search (dom-by-class data "mw-content-ltr\\ mw-parser-output")
                                        (lambda (node)
                                          (and (eq (cl-first node) 'p)
                                               (null (cl-second node)))))))))
                   ;; for `elquery' parser
                   ((featurep 'elquery)
                    (setq org-tag-eldoc--explanation
                          (elquery-text
                           (car (elquery-$ "#mw-content-text > div.mw-content-ltr.mw-parser-output > p" data))))))))
      :error (cl-function
              (lambda (&rest _args &key error-thrown &allow-other-keys)
                (message "[org-tag-eldoc] (Wikipedia) request error %s!" error-thrown)
                nil))
      ;; Missing pages and server errors are deliberately ignored.
      :status-code '((404 . (lambda (&rest _) nil))
                     (500 . (lambda (&rest _) nil))))))
56 ;; (with-current-buffer (url-retrieve-synchronously url)
58 ;; (goto-char url-http-end-of-headers)
59 ;; (libxml-parse-html-region (point) (point-max)))))
60 ;; ;; DEBUG: (setq request-result dom)
61 ;; (org-tag-eldoc-common--format-explanation
65 ;; (dom-search (dom-by-class dom "mw-content-ltr\\ mw-parser-output")
66 ;; (lambda (node) (and (eq (cl-first node) 'p) (null (cl-second node)))))))
67 ;; ((featurep 'elquery)
68 ;; (elquery-text (car (elquery-$ "#mw-content-text > div.mw-content-ltr.mw-parser-output > p" data))))))))
71 ;;; TEST: https://en.wikipedia.org/w/index.php?title=Computer_science
72 ;; (org-tag-eldoc-wikipedia--request "computer_science")
75 ;; (let ((url-proxy-services org-tag-eldoc-request-proxy))
76 ;; (with-current-buffer
77 ;; (url-retrieve-synchronously "https://en.wikipedia.org/w/index.php?title=Computer_science")
79 ;; (goto-char url-http-end-of-headers)
80 ;; (libxml-parse-html-region (point) (point-max)))))
81 ;; (setq request-result dom)
82 ;; (setq org-tag-eldoc--explanation
84 ;; (dom-search (dom-by-class dom "mw-content-ltr\\ mw-parser-output")
85 ;; (lambda (node) (and (eq (cl-first node) 'p) (null (cl-second node))))))))))
89 ;; (dom-search (dom-by-class request-result "mw-content-ltr\\ mw-parser-output")
90 ;; (lambda (node) (and (eq (cl-first node) 'p) (null (cl-second node))))))
93 ;;; this example works fine.
94 ;; (let ((url-proxy-services org-tag-eldoc-request-proxy))
95 ;; (with-current-buffer (url-retrieve-synchronously "https://en.wikipedia.org/w/index.php?title=GitHub")
96 ;; (let ((dom (libxml-parse-html-region (point-min) (point-max))))
97 ;; (let ((paragraph-node (dom-search (dom-by-class dom "mw-content-ltr\\ mw-parser-output")
98 ;; (lambda (node) (eq (car node) 'p)))))
99 ;; (setq node-github paragraph-node)
100 ;; (dom-texts paragraph-node)))))
102 ;; `node-github' ; looks fine
104 ;;; this example failed on `dom-texts'.
105 ;; (let ((url-proxy-services org-tag-eldoc-request-proxy))
106 ;; (with-current-buffer (url-retrieve-synchronously "https://en.wikipedia.org/w/index.php?title=Computer_science")
107 ;; (let ((dom (libxml-parse-html-region (point-min) (point-max))))
108 ;; (let ((paragraph-node (dom-search (dom-by-class dom "mw-content-ltr\\ mw-parser-output")
109 ;; (lambda (node) (eq (car node) 'p)))))
110 ;; (setq node-computer-science paragraph-node)
111 ;; (dom-texts paragraph-node)))))
113 ;; `node-computer-science' ; looks fine
;;; This example works with a new `dom-search' rule.
116 ;; (let ((url-proxy-services org-tag-eldoc-request-proxy))
117 ;; (with-current-buffer (url-retrieve-synchronously "https://en.wikipedia.org/w/index.php?title=Computer_science")
118 ;; (let ((dom (libxml-parse-html-region (point-min) (point-max))))
119 ;; (let ((paragraph-node (dom-search (dom-by-class dom "mw-content-ltr\\ mw-parser-output")
120 ;; (lambda (node) (and (eq (cl-first node) 'p)
121 ;; (null (cl-second node)))))))
122 ;; (setq node-computer-science paragraph-node)
123 ;; (dom-texts paragraph-node)))))
(defun org-tag-eldoc-wikipedia-query (tag)
  "Return the Wikipedia explanation for TAG, requesting it when needed.

When `org-tag-eldoc--explanation' already holds a string, that string
is returned unchanged.  Otherwise TAG is requested through
`org-tag-eldoc-wikipedia--request', and the value returned by
`org-tag-eldoc-database-save' for TAG and the current
`org-tag-eldoc--explanation' is returned."
  (cond
   ;; An explanation string is already available: reuse it.
   ((stringp org-tag-eldoc--explanation)
    org-tag-eldoc--explanation)
   (t
    (org-tag-eldoc-wikipedia--request tag)
    ;; NOTE(review): presumably the request above has filled in
    ;; `org-tag-eldoc--explanation' by this point -- confirm that it
    ;; completes before the explanation is saved.
    (org-tag-eldoc-database-save tag org-tag-eldoc--explanation))))
135 (provide 'org-tag-eldoc-wikipedia
)
137 ;;; org-tag-eldoc-wikipedia.el ends here