From 3962b1bdfb2a8a2e3a5ff4f4e51a61b0c44f2e6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?John=20Ankarstr=C3=B6m?= Date: Tue, 23 Aug 2022 01:43:09 +0200 Subject: Add Managed (generic RAII type). --- c/data.cpp | 98 ++++++++++++++++++++++++-------------------------------------- 1 file changed, 38 insertions(+), 60 deletions(-) (limited to 'c/data.cpp') diff --git a/c/data.cpp b/c/data.cpp index c1e7177..8d064a8 100644 --- a/c/data.cpp +++ b/c/data.cpp @@ -7,63 +7,25 @@ #include "data.h" #include "win.h" -struct InternetFile +struct XmlError : public std::exception { - InternetFile(const wchar_t* url) + const char* msg; + XmlError() { - hi = InternetOpen(L"Episode Browser", INTERNET_OPEN_TYPE_DIRECT, nullptr, nullptr, 0); - if (!hi) - throw Win32Error(); - - hiUrl = InternetOpenUrl(hi, url, - nullptr, 0, INTERNET_FLAG_NO_UI, 0); - if (!hiUrl) { - DWORD e = GetLastError(); - InternetCloseHandle(hi); - throw InternetError(e); - } + msg = xmlGetLastError()->message; } - - ~InternetFile() + virtual const char* what() const noexcept { - InternetCloseHandle(hiUrl); - InternetCloseHandle(hi); - } - - DWORD Read(void* buf, DWORD cb) - { - DWORD cbRead; - if (InternetReadFile(hiUrl, buf, cb, &cbRead)) - return cbRead; - else - throw InternetError(); + return msg; } - - HINTERNET hi; - HINTERNET hiUrl; }; -template -struct XmlPtr -{ - XmlPtr(T v) : v(v) - { - if (!v) - throw std::runtime_error(xmlGetLastError()->message); - } - ~XmlPtr() { F(v); } - operator T() { return v; } - T operator ->() { return v; } -private: - T v; -}; - -using HtmlParserCtxtPtr = XmlPtr; -using HtmlDocPtr = XmlPtr; -using XmlXPathContextPtr = XmlPtr; -using XmlXPathObjectPtr = XmlPtr; +using HtmlParserCtxtPtr = Managed; +using HtmlDocPtr = Managed; +using XmlXPathContextPtr = Managed; +using XmlXPathObjectPtr = Managed; static inline void XmlFree(void* p) { xmlFree(p); } -using XmlCharPtr = XmlPtr; +using XmlCharPtr = Managed; template bool WcharsFromXmlchars(wchar_t (&dst)[N], XmlCharPtr utf8) noexcept @@ -93,24 +55,36 @@ bool WcharsFromXmlchars(wchar_t (&dst)[N], XmlCharPtr utf8) noexcept void FetchData() { LIBXML_TEST_VERSION; + using InternetHandle = Managed; - InternetFile inf(L"https://www.detectiveconanworld.com/wiki/Anime"); - //InternetFile inf(L"file://C:/Users/John/Desktop/dcw.html"); - char buf[1024]; + /* The remote data is retrieved using WinINet from the + * Detective Conan World wiki. Using libxml2's "push parser", + * the HTML is parsed piece by piece as it is retrieved. */ + InternetHandle hi = InternetOpen(L"Episode Browser", INTERNET_OPEN_TYPE_DIRECT, nullptr, nullptr, 0); + InternetHandle hiUrl = InternetOpenUrl(hi, L"https://www.detectiveconanworld.com/wiki/Anime", + nullptr, 0, INTERNET_FLAG_NO_UI, 0); //L"file://C:/Users/John/Desktop/dcw.html" + + char buf[1024]; HtmlParserCtxtPtr ctxt = htmlCreatePushParserCtxt(nullptr, nullptr, buf, sizeof(buf), "https://www.detectiveconanworld.com/wiki/Anime", XML_CHAR_ENCODING_UTF8); - htmlCtxtUseOptions(ctxt, HTML_PARSE_RECOVER|HTML_PARSE_NOERROR|HTML_PARSE_NOWARNING); - while (const DWORD cbRead = inf.Read(&buf, sizeof(buf))) { + BOOL r; + DWORD cbRead; + while (r = InternetReadFile(hiUrl, buf, sizeof(buf), &cbRead), cbRead) { + if (!r) + throw InternetError(); if (!htmlParseChunk(ctxt, buf, cbRead, 0)) throw std::runtime_error(xmlGetLastError()->message); } htmlParseChunk(ctxt, buf, 0, 1); /* Stop parsing. */ - /* Find table rows containing episode data. */ + /* The episode data are contained in table rows matching a + * (very!) specific XPath query. This is fragile + * theoretically, but unlikely to break practically. */ + HtmlDocPtr doc = ctxt->myDoc; XmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc); XmlXPathObjectPtr xpathObj = xmlXPathEvalExpression( @@ -133,7 +107,10 @@ void FetchData() ElvDataA& e = g_fvElv.At(i); DlvDataA& d = g_fvDlv.At(i); - /* Get cells. */ + /* Each datum is contained within a specific cell in + * the row. The child element count above ensures that + * none of the following nodes are null. */ + const xmlNodePtr nodeEp = xmlFirstElementChild(node); const xmlNodePtr nodeTitle = xmlNextElementSibling(xmlNextElementSibling(nodeEp)); const xmlNodePtr nodeDate = xmlNextElementSibling(nodeTitle); @@ -142,17 +119,18 @@ void FetchData() const xmlNodePtr nodeHint = xmlNextElementSibling(nodeSource); WcharsFromXmlchars(e.siEp, xmlNodeGetContent(nodeEp)); - e.siEp[wcscspn(e.siEp, L"W")] = 0; /* Remove WPS suffix. */ + e.siEp[wcscspn(e.siEp, L"W")] = 0; /* Remove potential "WPS" suffix. */ WcharsFromXmlchars(e.title, xmlNodeGetContent(nodeTitle)); WcharsFromXmlchars(d.date, xmlNodeGetContent(nodeDate)); WcharsFromXmlchars(d.source, xmlNodeGetContent(nodeSource)); WcharsFromXmlchars(d.hint, xmlNodeGetContent(nodeHint)); - /* Get wiki URL. */ + /* Retrieve the link to the episode's wiki entry, + * which should be the first (and only) child element + * of the title node. */ const xmlNodePtr nodeLink = xmlFirstElementChild(nodeTitle); if (nodeLink) - WcharsFromXmlchars(d.wiki, - xmlGetProp(nodeLink, reinterpret_cast("href"))); + WcharsFromXmlchars(d.wiki, xmlGetProp(nodeLink, reinterpret_cast("href"))); } } -- cgit v1.2.3