#include #include #include #include #include #include "data.h" #include "win.h" struct XmlError : public std::exception { const char* msg; XmlError() { msg = xmlGetLastError()->message; } virtual const char* what() const noexcept { return msg; } }; using HtmlParserCtxtPtr = Managed; using HtmlDocPtr = Managed; using XmlXPathContextPtr = Managed; using XmlXPathObjectPtr = Managed; static inline void XmlFree(void* p) { xmlFree(p); } using XmlCharPtr = Managed; template bool WcharsFromXmlchars(wchar_t (&dst)[N], XmlCharPtr utf8) noexcept { /* Truncate if source is larger than destination. */ int lenUtf8 = xmlStrlen(utf8); utf8[Min(N, static_cast(lenUtf8))] = 0; /* Convert internal representation from UTF-8 to Latin-1, * which seems to actually convert the string to proper UTF-8 * (???). */ unsigned char lat1[N]; int lenLat1 = N-1; if (UTF8Toisolat1(lat1, &lenLat1, utf8, &lenUtf8) <= 0) return false; lat1[lenLat1] = 0; /* Write wide string to destination, if it fits. */ char* const src = reinterpret_cast(lat1); const int cchNarrow = lenLat1+1; const int cchWide = MultiByteToWideChar(CP_UTF8, 0, src, cchNarrow, nullptr, 0); if (static_cast(cchWide) > N) return false; return MultiByteToWideChar(CP_UTF8, 0, src, cchNarrow, dst, cchWide); } void FetchData() { LIBXML_TEST_VERSION; using InternetHandle = Managed; /* The remote data is retrieved using WinINet from the * Detective Conan World wiki. Using libxml2's "push parser", * the HTML is parsed piece by piece as it is retrieved. */ InternetHandle hi = InternetOpen(L"Episode Browser", INTERNET_OPEN_TYPE_DIRECT, nullptr, nullptr, 0); InternetHandle hiUrl = InternetOpenUrl(hi, L"https://www.detectiveconanworld.com/wiki/Anime", nullptr, 0, INTERNET_FLAG_NO_UI, 0); //L"file://C:/Users/John/Desktop/dcw.html" char buf[1024]; HtmlParserCtxtPtr ctxt = htmlCreatePushParserCtxt(nullptr, nullptr, buf, sizeof(buf), "https://www.detectiveconanworld.com/wiki/Anime", XML_CHAR_ENCODING_UTF8); htmlCtxtUseOptions(ctxt, HTML_PARSE_RECOVER|HTML_PARSE_NOERROR|HTML_PARSE_NOWARNING); BOOL r; DWORD cbRead; while (r = InternetReadFile(hiUrl, buf, sizeof(buf), &cbRead), cbRead) { if (!r) throw InternetError(); if (!htmlParseChunk(ctxt, buf, cbRead, 0)) throw std::runtime_error(xmlGetLastError()->message); } htmlParseChunk(ctxt, buf, 0, 1); /* Stop parsing. */ /* The episode data are contained in table rows matching a * (very!) specific XPath query. This is fragile * theoretically, but unlikely to break practically. */ HtmlDocPtr doc = ctxt->myDoc; XmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc); XmlXPathObjectPtr xpathObj = xmlXPathEvalExpression( reinterpret_cast("//tr[./td[1] != '' and ./td[3][@style='background:#f2fde9;']]"), xpathCtx); xmlNodeSetPtr nodes = xpathObj->nodesetval; int cNodes = nodes? nodes->nodeNr: 0; if (!cNodes) throw std::runtime_error("could not find remote episode information"); for (int i = 0; i < cNodes; i++) { extern FileView g_fvElv; extern FileView g_fvDlv; const xmlNodePtr node = nodes->nodeTab[i]; if (xmlChildElementCount(node) != 8) throw std::runtime_error("unexpected remote data format"); ElvDataA& e = g_fvElv.At(i); DlvDataA& d = g_fvDlv.At(i); /* Each datum is contained within a specific cell in * the row. The child element count above ensures that * none of the following nodes are null. */ const xmlNodePtr nodeEp = xmlFirstElementChild(node); const xmlNodePtr nodeTitle = xmlNextElementSibling(xmlNextElementSibling(nodeEp)); const xmlNodePtr nodeDate = xmlNextElementSibling(nodeTitle); const xmlNodePtr nodeSource = xmlNextElementSibling( xmlNextElementSibling(xmlNextElementSibling(nodeDate))); const xmlNodePtr nodeHint = xmlNextElementSibling(nodeSource); WcharsFromXmlchars(d.date, xmlNodeGetContent(nodeDate)); WcharsFromXmlchars(d.source, xmlNodeGetContent(nodeSource)); WcharsFromXmlchars(d.hint, xmlNodeGetContent(nodeHint)); e.bTVOriginal = wcsncmp(d.source, L"TV", 2) == 0? 1: 0; WcharsFromXmlchars(e.siEp, xmlNodeGetContent(nodeEp)); e.siEp[wcscspn(e.siEp, L"W")] = 0; /* Remove potential "WPS" suffix. */ WcharsFromXmlchars(e.title, xmlNodeGetContent(nodeTitle)); /* Retrieve the link to the episode's wiki entry, * which should be the first (and only) child element * of the title node. */ const xmlNodePtr nodeLink = xmlFirstElementChild(nodeTitle); if (nodeLink) WcharsFromXmlchars(d.wiki, xmlGetProp(nodeLink, reinterpret_cast("href"))); } } void WaitFetchData(bool* bDone) noexcept { try { FetchData(); *bDone = true; } catch (...) { *bDone = true; ShowException(L"Remote data could not be fetched due to an error: %s", L"Error", MB_ICONWARNING); } }