From 6832f91958e6c2cc44a8c4a4e126ea20b8c8d5a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?John=20Ankarstr=C3=B6m?= Date: Tue, 30 Aug 2022 02:19:56 +0200 Subject: Fix Unique Good, Bad. Simplify HTML parser. Good and Bad should obviously do the exact opposite thing to what they did. --- c/data.cpp | 76 +++++++++++++++++++++++++++++--------------------------------- 1 file changed, 36 insertions(+), 40 deletions(-) (limited to 'c/data.cpp') diff --git a/c/data.cpp b/c/data.cpp index df5627a..7ae6ad1 100644 --- a/c/data.cpp +++ b/c/data.cpp @@ -14,50 +14,44 @@ static Unique s_hi = InternetOpenW(L"Episode Browser", INTERNET_OPEN_TYPE_DIRECT, nullptr, nullptr, 0); -/* ParsedDoc downloads and parses an HTML document. */ -struct ParsedDoc +Unique RemoteParserCtxt(const wchar_t* wszUrl, const char* szUrl) { - Unique hiUrl; - Unique ctx; - char bufI[1024]; + if (s_hi.Bad(0)) + throw Win32Error(); + + Unique hiUrl = InternetOpenUrlW( + s_hi.v, wszUrl, nullptr, 0, INTERNET_FLAG_NO_UI, 0); + if (hiUrl.Bad(0)) + throw InternetError(); + char bufX[1024]; + Unique ctx = htmlCreatePushParserCtxt( + nullptr, nullptr, bufX, sizeof(bufX), szUrl, XML_CHAR_ENCODING_UTF8); + if (ctx.Bad(0)) + throw XmlError(); - ParsedDoc(const wchar_t* wszUrl, const char* szUrl) - { - if (!s_hi.Bad(0)) - throw Win32Error(); + htmlCtxtUseOptions(ctx.v, HTML_PARSE_RECOVER|HTML_PARSE_NOERROR|HTML_PARSE_NOWARNING); - hiUrl = InternetOpenUrlW(s_hi.v, wszUrl, nullptr, 0, INTERNET_FLAG_NO_UI, 0); - if (!hiUrl.Bad(0)) + BOOL r; + DWORD cbRead; + char bufI[1024]; + while (r = InternetReadFile(hiUrl.v, bufI, sizeof(bufI), &cbRead), cbRead) { + if (!r) throw InternetError(); - - ctx = htmlCreatePushParserCtxt(nullptr, nullptr, bufX, sizeof(bufX), szUrl, XML_CHAR_ENCODING_UTF8); - if (!ctx.Bad(0)) + if (!htmlParseChunk(ctx.v, bufI, cbRead, 0)) throw XmlError(); - - htmlCtxtUseOptions(ctx.v, HTML_PARSE_RECOVER|HTML_PARSE_NOERROR|HTML_PARSE_NOWARNING); - - BOOL r; - DWORD cbRead; - while (r = InternetReadFile(hiUrl.v, bufI, sizeof(bufI), &cbRead), cbRead) { - if (!r) - throw InternetError(); - if (!htmlParseChunk(ctx.v, bufI, cbRead, 0)) - throw XmlError(); - } - htmlParseChunk(ctx.v, bufI, 0, 1); /* Stop parsing. */ } + htmlParseChunk(ctx.v, bufI, 0, 1); /* Stop parsing. */ - operator htmlDocPtr() { return ctx.v->myDoc; } -}; + return ctx; +} static inline void XmlFree(void* p) { xmlFree(p); } template -bool WcharsFromXmlchars(wchar_t (&dst)[N], xmlChar* utf8_) +bool WcharsFromXmlchars(wchar_t (&dst)[N], Unique utf8) { - Unique utf8 = utf8_; - if (!utf8.Bad(0)) + if (utf8.Bad(0)) throw XmlError(); /* Truncate if source is larger than destination. */ @@ -167,17 +161,18 @@ void FetchData(unsigned char* sig) * specific XPath query. This is fragile theoretically, but * unlikely to break practically. */ - ParsedDoc doc(L"https://www.detectiveconanworld.com/wiki/Anime", - "https://www.detectiveconanworld.com/wiki/Anime"); + Unique ctx = + RemoteParserCtxt(L"https://www.detectiveconanworld.com/wiki/Anime", + "https://www.detectiveconanworld.com/wiki/Anime"); - Unique xpathCtx = xmlXPathNewContext(doc); - if (!xpathCtx.Bad(0)) + Unique xpathCtx = xmlXPathNewContext(ctx.v->myDoc); + if (xpathCtx.Bad(0)) throw XmlError(); Unique xpathObj = xmlXPathEvalExpression( reinterpret_cast("//tr[./td[1] != '' and ./td[3][@style='background:#f2fde9;']]"), xpathCtx.v); - if (!xpathObj.Bad(0)) + if (xpathObj.Bad(0)) throw XmlError(); xmlNodeSetPtr nodes = xpathObj.v->nodesetval; @@ -269,16 +264,17 @@ void FetchScreenwriters(unsigned char* sig) Wcscpy(Buf(url)+Len(prefix), d.wiki); /* Retrieve screenwriter from HTML. */ - ParsedDoc doc(url, nullptr); - Unique xpathCtx = xmlXPathNewContext(doc); - if (!xpathCtx.Bad(0)) + + Unique ctx = RemoteParserCtxt(url, nullptr); + Unique xpathCtx = xmlXPathNewContext(ctx.v->myDoc); + if (xpathCtx.Bad(0)) throw XmlError(); Unique xpathObj = xmlXPathEvalExpression(reinterpret_cast( "//th[contains(text(), 'Screenplay:')]/following-sibling::td"), xpathCtx.v); - if (!xpathObj.Bad(0)) + if (xpathObj.Bad(0)) throw XmlError(); xmlNodeSetPtr nodes = xpathObj.v->nodesetval; -- cgit v1.2.3