diff options
author | John Ankarström <john@ankarstrom.se> | 2022-08-30 02:19:56 +0200 |
---|---|---|
committer | John Ankarström <john@ankarstrom.se> | 2022-08-30 02:20:23 +0200 |
commit | 6832f91958e6c2cc44a8c4a4e126ea20b8c8d5a3 (patch) | |
tree | 4f08e1f31429e2e716fa618f550342287a091a0e /c/data.cpp | |
parent | 16ba8f3ae15363e921fca0e97f5c89cf12987b21 (diff) | |
download | EpisodeBrowser-6832f91958e6c2cc44a8c4a4e126ea20b8c8d5a3.tar.gz |
Fix Unique Good, Bad. Simplify HTML parser.
Good and Bad should obviously do the exact opposite thing
to what they did.
Diffstat (limited to 'c/data.cpp')
-rw-r--r-- | c/data.cpp | 76 |
1 files changed, 36 insertions, 40 deletions
```diff
@@ -14,50 +14,44 @@
 static Unique<HINTERNET, InternetCloseHandle> s_hi =
 	InternetOpenW(L"Episode Browser", INTERNET_OPEN_TYPE_DIRECT, nullptr, nullptr, 0);
 
-/* ParsedDoc downloads and parses an HTML document. */
-struct ParsedDoc
+Unique<htmlParserCtxtPtr, xmlFreeParserCtxt> RemoteParserCtxt(const wchar_t* wszUrl, const char* szUrl)
 {
-	Unique<HINTERNET, InternetCloseHandle> hiUrl;
-	Unique<htmlParserCtxtPtr, xmlFreeParserCtxt> ctx;
-	char bufI[1024];
+	if (s_hi.Bad(0))
+		throw Win32Error();
+
+	Unique<HINTERNET, InternetCloseHandle> hiUrl = InternetOpenUrlW(
+		s_hi.v, wszUrl, nullptr, 0, INTERNET_FLAG_NO_UI, 0);
+	if (hiUrl.Bad(0))
+		throw InternetError();
+
 	char bufX[1024];
+	Unique<htmlParserCtxtPtr, xmlFreeParserCtxt> ctx = htmlCreatePushParserCtxt(
+		nullptr, nullptr, bufX, sizeof(bufX), szUrl, XML_CHAR_ENCODING_UTF8);
+	if (ctx.Bad(0))
+		throw XmlError();
 
-	ParsedDoc(const wchar_t* wszUrl, const char* szUrl)
-	{
-		if (!s_hi.Bad(0))
-			throw Win32Error();
+	htmlCtxtUseOptions(ctx.v, HTML_PARSE_RECOVER|HTML_PARSE_NOERROR|HTML_PARSE_NOWARNING);
 
-		hiUrl = InternetOpenUrlW(s_hi.v, wszUrl, nullptr, 0, INTERNET_FLAG_NO_UI, 0);
-		if (!hiUrl.Bad(0))
+	BOOL r;
+	DWORD cbRead;
+	char bufI[1024];
+	while (r = InternetReadFile(hiUrl.v, bufI, sizeof(bufI), &cbRead), cbRead) {
+		if (!r)
 			throw InternetError();
-
-		ctx = htmlCreatePushParserCtxt(nullptr, nullptr, bufX, sizeof(bufX), szUrl, XML_CHAR_ENCODING_UTF8);
-		if (!ctx.Bad(0))
+		if (!htmlParseChunk(ctx.v, bufI, cbRead, 0))
 			throw XmlError();
-
-		htmlCtxtUseOptions(ctx.v, HTML_PARSE_RECOVER|HTML_PARSE_NOERROR|HTML_PARSE_NOWARNING);
-
-		BOOL r;
-		DWORD cbRead;
-		while (r = InternetReadFile(hiUrl.v, bufI, sizeof(bufI), &cbRead), cbRead) {
-			if (!r)
-				throw InternetError();
-			if (!htmlParseChunk(ctx.v, bufI, cbRead, 0))
-				throw XmlError();
-		}
-		htmlParseChunk(ctx.v, bufI, 0, 1); /* Stop parsing. */
 	}
+	htmlParseChunk(ctx.v, bufI, 0, 1); /* Stop parsing. */
 
-	operator htmlDocPtr() { return ctx.v->myDoc; }
-};
+	return ctx;
+}
 
 static inline void XmlFree(void* p) { xmlFree(p); }
 
 template <size_t N>
-bool WcharsFromXmlchars(wchar_t (&dst)[N], xmlChar* utf8_)
+bool WcharsFromXmlchars(wchar_t (&dst)[N], Unique<xmlChar*, XmlFree> utf8)
 {
-	Unique<xmlChar*, XmlFree> utf8 = utf8_;
-	if (!utf8.Bad(0))
+	if (utf8.Bad(0))
 		throw XmlError();
 
 	/* Truncate if source is larger than destination. */
@@ -167,17 +161,18 @@ void FetchData(unsigned char* sig)
 	 * specific XPath query. This is fragile theoretically, but
 	 * unlikely to break practically.
 	 */
-	ParsedDoc doc(L"https://www.detectiveconanworld.com/wiki/Anime",
-		"https://www.detectiveconanworld.com/wiki/Anime");
+	Unique<htmlParserCtxtPtr, xmlFreeParserCtxt> ctx =
+		RemoteParserCtxt(L"https://www.detectiveconanworld.com/wiki/Anime",
+		"https://www.detectiveconanworld.com/wiki/Anime");
 
-	Unique<xmlXPathContextPtr, xmlXPathFreeContext> xpathCtx = xmlXPathNewContext(doc);
-	if (!xpathCtx.Bad(0))
+	Unique<xmlXPathContextPtr, xmlXPathFreeContext> xpathCtx = xmlXPathNewContext(ctx.v->myDoc);
+	if (xpathCtx.Bad(0))
 		throw XmlError();
 
 	Unique<xmlXPathObjectPtr, xmlXPathFreeObject> xpathObj = xmlXPathEvalExpression(
 		reinterpret_cast<const xmlChar*>("//tr[./td[1] != '' and ./td[3][@style='background:#f2fde9;']]"),
 		xpathCtx.v);
-	if (!xpathObj.Bad(0))
+	if (xpathObj.Bad(0))
 		throw XmlError();
 
 	xmlNodeSetPtr nodes = xpathObj.v->nodesetval;
@@ -269,16 +264,17 @@ void FetchScreenwriters(unsigned char* sig)
 	Wcscpy(Buf(url)+Len(prefix), d.wiki);
 
 	/* Retrieve screenwriter from HTML. */
-	ParsedDoc doc(url, nullptr);
-	Unique<xmlXPathContextPtr, xmlXPathFreeContext> xpathCtx = xmlXPathNewContext(doc);
-	if (!xpathCtx.Bad(0))
+
+	Unique<htmlParserCtxtPtr, xmlFreeParserCtxt> ctx = RemoteParserCtxt(url, nullptr);
+	Unique<xmlXPathContextPtr, xmlXPathFreeContext> xpathCtx = xmlXPathNewContext(ctx.v->myDoc);
+	if (xpathCtx.Bad(0))
 		throw XmlError();
 
 	Unique<xmlXPathObjectPtr, xmlXPathFreeObject> xpathObj =
 		xmlXPathEvalExpression(reinterpret_cast<const xmlChar*>(
 		"//th[contains(text(), 'Screenplay:')]/following-sibling::td"),
 		xpathCtx.v);
-	if (!xpathObj.Bad(0))
+	if (xpathObj.Bad(0))
 		throw XmlError();
 
 	xmlNodeSetPtr nodes = xpathObj.v->nodesetval;
```