diff options
author | John Ankarström <john@ankarstrom.se> | 2022-08-23 18:52:54 +0200 |
---|---|---|
committer | John Ankarström <john@ankarstrom.se> | 2022-08-23 18:52:54 +0200 |
commit | 6248b63041d8d5e8ae5b15fe58ccde897b3cfb6e (patch) | |
tree | 7ffd1fffc879e4993cb27d4761feb8d8b7e9e84f /c/data.cpp | |
parent | e78f83c394bc1257d86ba8a9c8009f77cf32f94f (diff) | |
download | EpisodeBrowser-6248b63041d8d5e8ae5b15fe58ccde897b3cfb6e.tar.gz |
Refactor HTML retrieval and parsing.
Diffstat (limited to 'c/data.cpp')
-rw-r--r-- | c/data.cpp | 67 |
1 files changed, 37 insertions, 30 deletions
@@ -20,13 +20,42 @@ struct XmlError : public std::exception } }; -using HtmlParserCtxtPtr = Managed<htmlParserCtxtPtr, xmlFreeParserCtxt, XmlError>; -using HtmlDocPtr = Managed<htmlDocPtr, xmlFreeDoc, XmlError>; using XmlXPathContextPtr = Managed<xmlXPathContextPtr, xmlXPathFreeContext, XmlError>; using XmlXPathObjectPtr = Managed<xmlXPathObjectPtr, xmlXPathFreeObject, XmlError>; static inline void XmlFree(void* p) { xmlFree(p); } using XmlCharPtr = Managed<xmlChar*, XmlFree, XmlError>; +struct ParsedDoc +{ + using InternetHandle = Managed<HINTERNET, InternetCloseHandle, InternetError>; + using HtmlParserCtxtPtr = Managed<htmlParserCtxtPtr, xmlFreeParserCtxt, XmlError>; + using HtmlDocPtr = Managed<htmlDocPtr, xmlFreeDoc, XmlError>; + + InternetHandle hi; + InternetHandle hiUrl; + HtmlParserCtxtPtr ctxt; + char buf[1024]; + + ParsedDoc(const wchar_t* wszUrl, const char* szUrl) + : hi(InternetOpen(L"Episode Browser", INTERNET_OPEN_TYPE_DIRECT, nullptr, nullptr, 0)), + hiUrl(InternetOpenUrl(hi, wszUrl, nullptr, 0, INTERNET_FLAG_NO_UI, 0)), + ctxt(htmlCreatePushParserCtxt(nullptr, nullptr, buf, sizeof(buf), szUrl, XML_CHAR_ENCODING_UTF8)) + { + htmlCtxtUseOptions(ctxt, HTML_PARSE_RECOVER|HTML_PARSE_NOERROR|HTML_PARSE_NOWARNING); + + BOOL r; + DWORD cbRead; + while (r = InternetReadFile(hiUrl, buf, sizeof(buf), &cbRead), cbRead) { + if (!r) + throw InternetError(); + if (!htmlParseChunk(ctxt, buf, cbRead, 0)) + throw XmlError(); + } + htmlParseChunk(ctxt, buf, 0, 1); /* Stop parsing. */ + } + + operator htmlDocPtr() { return ctxt->myDoc; } +}; template <size_t N> bool WcharsFromXmlchars(wchar_t (&dst)[N], XmlCharPtr utf8) noexcept { @@ -54,37 +83,15 @@ bool WcharsFromXmlchars(wchar_t (&dst)[N], XmlCharPtr utf8) noexcept void FetchData() { - LIBXML_TEST_VERSION; - using InternetHandle = Managed<HINTERNET, InternetCloseHandle, InternetError>; - /* The remote data is retrieved using WinINet from the * Detective Conan World wiki. Using libxml2's "push parser", - * the HTML is parsed piece by piece as it is retrieved. */ - - InternetHandle hi = InternetOpen(L"Episode Browser", INTERNET_OPEN_TYPE_DIRECT, nullptr, nullptr, 0); - InternetHandle hiUrl = InternetOpenUrl(hi, L"https://www.detectiveconanworld.com/wiki/Anime", - nullptr, 0, INTERNET_FLAG_NO_UI, 0); //L"file://C:/Users/John/Desktop/dcw.html" - - char buf[1024]; - HtmlParserCtxtPtr ctxt = htmlCreatePushParserCtxt(nullptr, nullptr, - buf, sizeof(buf), "https://www.detectiveconanworld.com/wiki/Anime", - XML_CHAR_ENCODING_UTF8); - htmlCtxtUseOptions(ctxt, HTML_PARSE_RECOVER|HTML_PARSE_NOERROR|HTML_PARSE_NOWARNING); - - BOOL r; - DWORD cbRead; - while (r = InternetReadFile(hiUrl, buf, sizeof(buf), &cbRead), cbRead) { - if (!r) - throw InternetError(); - if (!htmlParseChunk(ctxt, buf, cbRead, 0)) - throw std::runtime_error(xmlGetLastError()->message); - } - htmlParseChunk(ctxt, buf, 0, 1); /* Stop parsing. */ + * the HTML is parsed piece by piece as it is retrieved. The + * episode data are contained in table rows matching a (very!) + * specific XPath query. This is fragile theoretically, but + * unlikely to break practically. */ - /* The episode data are contained in table rows matching a - * (very!) specific XPath query. This is fragile - * theoretically, but unlikely to break practically. */ - HtmlDocPtr doc = ctxt->myDoc; + ParsedDoc doc(L"https://www.detectiveconanworld.com/wiki/Anime", + "https://www.detectiveconanworld.com/wiki/Anime"); XmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc); XmlXPathObjectPtr xpathObj = xmlXPathEvalExpression( reinterpret_cast<const xmlChar*>("//tr[./td[1] != '' and ./td[3][@style='background:#f2fde9;']]"), |