diff options
Diffstat (limited to 'c/data.cpp')
-rw-r--r-- | c/data.cpp | 98 |
1 files changed, 38 insertions, 60 deletions
@@ -7,63 +7,25 @@ #include "data.h" #include "win.h" -struct InternetFile +struct XmlError : public std::exception { - InternetFile(const wchar_t* url) + const char* msg; + XmlError() { - hi = InternetOpen(L"Episode Browser", INTERNET_OPEN_TYPE_DIRECT, nullptr, nullptr, 0); - if (!hi) - throw Win32Error(); - - hiUrl = InternetOpenUrl(hi, url, - nullptr, 0, INTERNET_FLAG_NO_UI, 0); - if (!hiUrl) { - DWORD e = GetLastError(); - InternetCloseHandle(hi); - throw InternetError(e); - } + msg = xmlGetLastError()->message; } - - ~InternetFile() + virtual const char* what() const noexcept { - InternetCloseHandle(hiUrl); - InternetCloseHandle(hi); - } - - DWORD Read(void* buf, DWORD cb) - { - DWORD cbRead; - if (InternetReadFile(hiUrl, buf, cb, &cbRead)) - return cbRead; - else - throw InternetError(); + return msg; } - - HINTERNET hi; - HINTERNET hiUrl; }; -template <auto F, typename T> -struct XmlPtr -{ - XmlPtr(T v) : v(v) - { - if (!v) - throw std::runtime_error(xmlGetLastError()->message); - } - ~XmlPtr() { F(v); } - operator T() { return v; } - T operator ->() { return v; } -private: - T v; -}; - -using HtmlParserCtxtPtr = XmlPtr<xmlFreeParserCtxt, htmlParserCtxtPtr>; -using HtmlDocPtr = XmlPtr<xmlFreeDoc, htmlDocPtr>; -using XmlXPathContextPtr = XmlPtr<xmlXPathFreeContext, xmlXPathContextPtr>; -using XmlXPathObjectPtr = XmlPtr<xmlXPathFreeObject, xmlXPathObjectPtr>; +using HtmlParserCtxtPtr = Managed<htmlParserCtxtPtr, xmlFreeParserCtxt, XmlError>; +using HtmlDocPtr = Managed<htmlDocPtr, xmlFreeDoc, XmlError>; +using XmlXPathContextPtr = Managed<xmlXPathContextPtr, xmlXPathFreeContext, XmlError>; +using XmlXPathObjectPtr = Managed<xmlXPathObjectPtr, xmlXPathFreeObject, XmlError>; static inline void XmlFree(void* p) { xmlFree(p); } -using XmlCharPtr = XmlPtr<XmlFree, xmlChar*>; +using XmlCharPtr = Managed<xmlChar*, XmlFree, XmlError>; template <size_t N> bool WcharsFromXmlchars(wchar_t (&dst)[N], XmlCharPtr utf8) noexcept @@ -93,24 +55,36 @@ bool WcharsFromXmlchars(wchar_t (&dst)[N], XmlCharPtr utf8) noexcept void FetchData() { LIBXML_TEST_VERSION; + using InternetHandle = Managed<HINTERNET, InternetCloseHandle, InternetError>; - InternetFile inf(L"https://www.detectiveconanworld.com/wiki/Anime"); - //InternetFile inf(L"file://C:/Users/John/Desktop/dcw.html"); - char buf[1024]; + /* The remote data is retrieved using WinINet from the + * Detective Conan World wiki. Using libxml2's "push parser", + * the HTML is parsed piece by piece as it is retrieved. */ + InternetHandle hi = InternetOpen(L"Episode Browser", INTERNET_OPEN_TYPE_DIRECT, nullptr, nullptr, 0); + InternetHandle hiUrl = InternetOpenUrl(hi, L"https://www.detectiveconanworld.com/wiki/Anime", + nullptr, 0, INTERNET_FLAG_NO_UI, 0); //L"file://C:/Users/John/Desktop/dcw.html" + + char buf[1024]; HtmlParserCtxtPtr ctxt = htmlCreatePushParserCtxt(nullptr, nullptr, buf, sizeof(buf), "https://www.detectiveconanworld.com/wiki/Anime", XML_CHAR_ENCODING_UTF8); - htmlCtxtUseOptions(ctxt, HTML_PARSE_RECOVER|HTML_PARSE_NOERROR|HTML_PARSE_NOWARNING); - while (const DWORD cbRead = inf.Read(&buf, sizeof(buf))) { + BOOL r; + DWORD cbRead; + while (r = InternetReadFile(hiUrl, buf, sizeof(buf), &cbRead), cbRead) { + if (!r) + throw InternetError(); if (!htmlParseChunk(ctxt, buf, cbRead, 0)) throw std::runtime_error(xmlGetLastError()->message); } htmlParseChunk(ctxt, buf, 0, 1); /* Stop parsing. */ - /* Find table rows containing episode data. */ + /* The episode data are contained in table rows matching a + * (very!) specific XPath query. This is fragile + * theoretically, but unlikely to break practically. */ + HtmlDocPtr doc = ctxt->myDoc; XmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc); XmlXPathObjectPtr xpathObj = xmlXPathEvalExpression( @@ -133,7 +107,10 @@ void FetchData() ElvDataA& e = g_fvElv.At(i); DlvDataA& d = g_fvDlv.At(i); - /* Get cells. */ + /* Each datum is contained within a specific cell in + * the row. The child element count above ensures that + * none of the following nodes are null. */ + const xmlNodePtr nodeEp = xmlFirstElementChild(node); const xmlNodePtr nodeTitle = xmlNextElementSibling(xmlNextElementSibling(nodeEp)); const xmlNodePtr nodeDate = xmlNextElementSibling(nodeTitle); @@ -142,17 +119,18 @@ void FetchData() const xmlNodePtr nodeHint = xmlNextElementSibling(nodeSource); WcharsFromXmlchars(e.siEp, xmlNodeGetContent(nodeEp)); - e.siEp[wcscspn(e.siEp, L"W")] = 0; /* Remove WPS suffix. */ + e.siEp[wcscspn(e.siEp, L"W")] = 0; /* Remove potential "WPS" suffix. */ WcharsFromXmlchars(e.title, xmlNodeGetContent(nodeTitle)); WcharsFromXmlchars(d.date, xmlNodeGetContent(nodeDate)); WcharsFromXmlchars(d.source, xmlNodeGetContent(nodeSource)); WcharsFromXmlchars(d.hint, xmlNodeGetContent(nodeHint)); - /* Get wiki URL. */ + /* Retrieve the link to the episode's wiki entry, + * which should be the first (and only) child element + * of the title node. */ const xmlNodePtr nodeLink = xmlFirstElementChild(nodeTitle); if (nodeLink) - WcharsFromXmlchars(d.wiki, - xmlGetProp(nodeLink, reinterpret_cast<const xmlChar*>("href"))); + WcharsFromXmlchars(d.wiki, xmlGetProp(nodeLink, reinterpret_cast<const xmlChar*>("href"))); } } |