From 6248b63041d8d5e8ae5b15fe58ccde897b3cfb6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?John=20Ankarstr=C3=B6m?= Date: Tue, 23 Aug 2022 18:52:54 +0200 Subject: Refactor HTML retrieval and parsing. --- c/data.cpp | 67 ++++++++++++++++++++++++++++++++++---------------------------- c/main.cpp | 2 ++ 2 files changed, 39 insertions(+), 30 deletions(-) diff --git a/c/data.cpp b/c/data.cpp index 149d9b6..7c0fcb7 100644 --- a/c/data.cpp +++ b/c/data.cpp @@ -20,13 +20,42 @@ struct XmlError : public std::exception } }; -using HtmlParserCtxtPtr = Managed; -using HtmlDocPtr = Managed; using XmlXPathContextPtr = Managed; using XmlXPathObjectPtr = Managed; static inline void XmlFree(void* p) { xmlFree(p); } using XmlCharPtr = Managed; +struct ParsedDoc +{ + using InternetHandle = Managed; + using HtmlParserCtxtPtr = Managed; + using HtmlDocPtr = Managed; + + InternetHandle hi; + InternetHandle hiUrl; + HtmlParserCtxtPtr ctxt; + char buf[1024]; + + ParsedDoc(const wchar_t* wszUrl, const char* szUrl) + : hi(InternetOpen(L"Episode Browser", INTERNET_OPEN_TYPE_DIRECT, nullptr, nullptr, 0)), + hiUrl(InternetOpenUrl(hi, wszUrl, nullptr, 0, INTERNET_FLAG_NO_UI, 0)), + ctxt(htmlCreatePushParserCtxt(nullptr, nullptr, buf, sizeof(buf), szUrl, XML_CHAR_ENCODING_UTF8)) + { + htmlCtxtUseOptions(ctxt, HTML_PARSE_RECOVER|HTML_PARSE_NOERROR|HTML_PARSE_NOWARNING); + + BOOL r; + DWORD cbRead; + while (r = InternetReadFile(hiUrl, buf, sizeof(buf), &cbRead), cbRead) { + if (!r) + throw InternetError(); + if (!htmlParseChunk(ctxt, buf, cbRead, 0)) + throw XmlError(); + } + htmlParseChunk(ctxt, buf, 0, 1); /* Stop parsing. */ + } + + operator htmlDocPtr() { return ctxt->myDoc; } +}; template bool WcharsFromXmlchars(wchar_t (&dst)[N], XmlCharPtr utf8) noexcept { @@ -54,37 +83,15 @@ bool WcharsFromXmlchars(wchar_t (&dst)[N], XmlCharPtr utf8) noexcept void FetchData() { - LIBXML_TEST_VERSION; - using InternetHandle = Managed; - /* The remote data is retrieved using WinINet from the * Detective Conan World wiki. Using libxml2's "push parser", - * the HTML is parsed piece by piece as it is retrieved. */ - - InternetHandle hi = InternetOpen(L"Episode Browser", INTERNET_OPEN_TYPE_DIRECT, nullptr, nullptr, 0); - InternetHandle hiUrl = InternetOpenUrl(hi, L"https://www.detectiveconanworld.com/wiki/Anime", - nullptr, 0, INTERNET_FLAG_NO_UI, 0); //L"file://C:/Users/John/Desktop/dcw.html" - - char buf[1024]; - HtmlParserCtxtPtr ctxt = htmlCreatePushParserCtxt(nullptr, nullptr, - buf, sizeof(buf), "https://www.detectiveconanworld.com/wiki/Anime", - XML_CHAR_ENCODING_UTF8); - htmlCtxtUseOptions(ctxt, HTML_PARSE_RECOVER|HTML_PARSE_NOERROR|HTML_PARSE_NOWARNING); - - BOOL r; - DWORD cbRead; - while (r = InternetReadFile(hiUrl, buf, sizeof(buf), &cbRead), cbRead) { - if (!r) - throw InternetError(); - if (!htmlParseChunk(ctxt, buf, cbRead, 0)) - throw std::runtime_error(xmlGetLastError()->message); - } - htmlParseChunk(ctxt, buf, 0, 1); /* Stop parsing. */ + * the HTML is parsed piece by piece as it is retrieved. The + * episode data are contained in table rows matching a (very!) + * specific XPath query. This is fragile theoretically, but + * unlikely to break practically. */ - /* The episode data are contained in table rows matching a - * (very!) specific XPath query. This is fragile - * theoretically, but unlikely to break practically. */ - HtmlDocPtr doc = ctxt->myDoc; + ParsedDoc doc(L"https://www.detectiveconanworld.com/wiki/Anime", + "https://www.detectiveconanworld.com/wiki/Anime"); XmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc); XmlXPathObjectPtr xpathObj = xmlXPathEvalExpression( reinterpret_cast("//tr[./td[1] != '' and ./td[3][@style='background:#f2fde9;']]"), diff --git a/c/main.cpp b/c/main.cpp index 8dc5eab..771b323 100644 --- a/c/main.cpp +++ b/c/main.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include "debug.h" #include "resource.h" @@ -86,6 +87,7 @@ static void UpdateTheme(); int WINAPI WinMain(const HINSTANCE hInstance, const HINSTANCE, char* const, const int nCmdShow) { setbuf(stdout, nullptr); + LIBXML_TEST_VERSION; /* Initialize Prolog. */ const char* argv[] = {"EpisodeBrowser", nullptr}; -- cgit v1.2.3