about summary refs log tree commit diff
path: root/c/data.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'c/data.cpp')
-rw-r--r-- c/data.cpp 98
1 files changed, 38 insertions, 60 deletions
diff --git a/c/data.cpp b/c/data.cpp
index c1e7177..8d064a8 100644
--- a/c/data.cpp
+++ b/c/data.cpp
@@ -7,63 +7,25 @@
#include "data.h"
#include "win.h"
-struct InternetFile
+struct XmlError : public std::exception
{
- InternetFile(const wchar_t* url)
+ const char* msg;
+ XmlError()
{
- hi = InternetOpen(L"Episode Browser", INTERNET_OPEN_TYPE_DIRECT, nullptr, nullptr, 0);
- if (!hi)
- throw Win32Error();
-
- hiUrl = InternetOpenUrl(hi, url,
- nullptr, 0, INTERNET_FLAG_NO_UI, 0);
- if (!hiUrl) {
- DWORD e = GetLastError();
- InternetCloseHandle(hi);
- throw InternetError(e);
- }
+ msg = xmlGetLastError()->message;
}
-
- ~InternetFile()
+ virtual const char* what() const noexcept
{
- InternetCloseHandle(hiUrl);
- InternetCloseHandle(hi);
- }
-
- DWORD Read(void* buf, DWORD cb)
- {
- DWORD cbRead;
- if (InternetReadFile(hiUrl, buf, cb, &cbRead))
- return cbRead;
- else
- throw InternetError();
+ return msg;
}
-
- HINTERNET hi;
- HINTERNET hiUrl;
};
-template <auto F, typename T>
-struct XmlPtr
-{
- XmlPtr(T v) : v(v)
- {
- if (!v)
- throw std::runtime_error(xmlGetLastError()->message);
- }
- ~XmlPtr() { F(v); }
- operator T() { return v; }
- T operator ->() { return v; }
-private:
- T v;
-};
-
-using HtmlParserCtxtPtr = XmlPtr<xmlFreeParserCtxt, htmlParserCtxtPtr>;
-using HtmlDocPtr = XmlPtr<xmlFreeDoc, htmlDocPtr>;
-using XmlXPathContextPtr = XmlPtr<xmlXPathFreeContext, xmlXPathContextPtr>;
-using XmlXPathObjectPtr = XmlPtr<xmlXPathFreeObject, xmlXPathObjectPtr>;
+using HtmlParserCtxtPtr = Managed<htmlParserCtxtPtr, xmlFreeParserCtxt, XmlError>;
+using HtmlDocPtr = Managed<htmlDocPtr, xmlFreeDoc, XmlError>;
+using XmlXPathContextPtr = Managed<xmlXPathContextPtr, xmlXPathFreeContext, XmlError>;
+using XmlXPathObjectPtr = Managed<xmlXPathObjectPtr, xmlXPathFreeObject, XmlError>;
static inline void XmlFree(void* p) { xmlFree(p); }
-using XmlCharPtr = XmlPtr<XmlFree, xmlChar*>;
+using XmlCharPtr = Managed<xmlChar*, XmlFree, XmlError>;
template <size_t N>
bool WcharsFromXmlchars(wchar_t (&dst)[N], XmlCharPtr utf8) noexcept
@@ -93,24 +55,36 @@ bool WcharsFromXmlchars(wchar_t (&dst)[N], XmlCharPtr utf8) noexcept
void FetchData()
{
LIBXML_TEST_VERSION;
+ using InternetHandle = Managed<HINTERNET, InternetCloseHandle, InternetError>;
- InternetFile inf(L"https://www.detectiveconanworld.com/wiki/Anime");
- //InternetFile inf(L"file://C:/Users/John/Desktop/dcw.html");
- char buf[1024];
+ /* The remote data is retrieved using WinINet from the
+ * Detective Conan World wiki. Using libxml2's "push parser",
+ * the HTML is parsed piece by piece as it is retrieved. */
+ InternetHandle hi = InternetOpen(L"Episode Browser", INTERNET_OPEN_TYPE_DIRECT, nullptr, nullptr, 0);
+ InternetHandle hiUrl = InternetOpenUrl(hi, L"https://www.detectiveconanworld.com/wiki/Anime",
+ nullptr, 0, INTERNET_FLAG_NO_UI, 0); //L"file://C:/Users/John/Desktop/dcw.html"
+
+ char buf[1024];
HtmlParserCtxtPtr ctxt = htmlCreatePushParserCtxt(nullptr, nullptr,
buf, sizeof(buf), "https://www.detectiveconanworld.com/wiki/Anime",
XML_CHAR_ENCODING_UTF8);
-
htmlCtxtUseOptions(ctxt, HTML_PARSE_RECOVER|HTML_PARSE_NOERROR|HTML_PARSE_NOWARNING);
- while (const DWORD cbRead = inf.Read(&buf, sizeof(buf))) {
+ BOOL r;
+ DWORD cbRead;
+ while (r = InternetReadFile(hiUrl, buf, sizeof(buf), &cbRead), cbRead) {
+ if (!r)
+ throw InternetError();
if (!htmlParseChunk(ctxt, buf, cbRead, 0))
throw std::runtime_error(xmlGetLastError()->message);
}
htmlParseChunk(ctxt, buf, 0, 1); /* Stop parsing. */
- /* Find table rows containing episode data. */
+ /* The episode data are contained in table rows matching a
+ * (very!) specific XPath query. This is fragile
+ * theoretically, but unlikely to break practically. */
+
HtmlDocPtr doc = ctxt->myDoc;
XmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc);
XmlXPathObjectPtr xpathObj = xmlXPathEvalExpression(
@@ -133,7 +107,10 @@ void FetchData()
ElvDataA& e = g_fvElv.At(i);
DlvDataA& d = g_fvDlv.At(i);
- /* Get cells. */
+ /* Each datum is contained within a specific cell in
+ * the row. The child element count above ensures that
+ * none of the following nodes are null. */
+
const xmlNodePtr nodeEp = xmlFirstElementChild(node);
const xmlNodePtr nodeTitle = xmlNextElementSibling(xmlNextElementSibling(nodeEp));
const xmlNodePtr nodeDate = xmlNextElementSibling(nodeTitle);
@@ -142,17 +119,18 @@ void FetchData()
const xmlNodePtr nodeHint = xmlNextElementSibling(nodeSource);
WcharsFromXmlchars(e.siEp, xmlNodeGetContent(nodeEp));
- e.siEp[wcscspn(e.siEp, L"W")] = 0; /* Remove WPS suffix. */
+ e.siEp[wcscspn(e.siEp, L"W")] = 0; /* Remove potential "WPS" suffix. */
WcharsFromXmlchars(e.title, xmlNodeGetContent(nodeTitle));
WcharsFromXmlchars(d.date, xmlNodeGetContent(nodeDate));
WcharsFromXmlchars(d.source, xmlNodeGetContent(nodeSource));
WcharsFromXmlchars(d.hint, xmlNodeGetContent(nodeHint));
- /* Get wiki URL. */
+ /* Retrieve the link to the episode's wiki entry,
+ * which should be the first (and only) child element
+ * of the title node. */
const xmlNodePtr nodeLink = xmlFirstElementChild(nodeTitle);
if (nodeLink)
- WcharsFromXmlchars(d.wiki,
- xmlGetProp(nodeLink, reinterpret_cast<const xmlChar*>("href")));
+ WcharsFromXmlchars(d.wiki, xmlGetProp(nodeLink, reinterpret_cast<const xmlChar*>("href")));
}
}