#include #include #include #include #include #include #include "data.h" struct InternetFile { InternetFile(const wchar_t* url); ~InternetFile(); DWORD Read(void* buf, DWORD cb); HINTERNET hi; HINTERNET hiUrl; }; InternetFile::InternetFile(const wchar_t* url) { hi = InternetOpen(L"Episode Browser", INTERNET_OPEN_TYPE_DIRECT, nullptr, nullptr, /*INTERNET_FLAG_ASYNC*/0); if (!hi) throw Win32Error{}; hiUrl = InternetOpenUrl(hi, url, nullptr, 0, INTERNET_FLAG_NO_UI, 0); if (!hiUrl) { InternetCloseHandle(hi); throw Win32Error{}; } } InternetFile::~InternetFile() { InternetCloseHandle(hiUrl); InternetCloseHandle(hi); } DWORD InternetFile::Read(void* buf, DWORD cb) { DWORD cbRead; if (InternetReadFile(hiUrl, buf, cb, &cbRead)) return cbRead; else throw Win32Error{}; } template struct XmlPtr { XmlPtr(T v) : v(v) { if (!v) throw std::runtime_error(xmlGetLastError()->message); } ~XmlPtr() { F(v); } operator T() { return v; } T operator ->() { return v; } private: T v; }; using HtmlParserCtxtPtr = XmlPtr; using HtmlDocPtr = XmlPtr; using XmlXPathContextPtr = XmlPtr; using XmlXPathObjectPtr = XmlPtr; static inline void XmlFree(void* p) { xmlFree(p); } using XmlCharPtr = XmlPtr; template bool WcharsFromXmlchars(wchar_t (&dst)[N], XmlCharPtr utf8) { /* Truncate if source is larger than destination. */ int lenUtf8 = xmlStrlen(utf8); utf8[std::min(N, static_cast(lenUtf8))] = 0; /* Convert internal representation from UTF-8 to Latin-1, * which seems to actually convert the string to proper UTF-8 * (???). */ unsigned char lat1[N]; int lenLat1 = N-1; if (UTF8Toisolat1(lat1, &lenLat1, utf8, &lenUtf8) <= 0) return false; lat1[lenLat1] = 0; /* Write wide string to destination, if it fits. */ char* const src = reinterpret_cast(lat1); const int cchNarrow = lenLat1+1; const int cchWide = MultiByteToWideChar(CP_UTF8, 0, src, cchNarrow, nullptr, 0); if (static_cast(cchWide) > N) return false; return MultiByteToWideChar(CP_UTF8, 0, src, cchNarrow, dst, cchWide); } void FetchData(bool* bDone) { LIBXML_TEST_VERSION; InternetFile inf{L"https://www.detectiveconanworld.com/wiki/Anime"}; //InternetFile inf{L"file://C:/Users/John/Desktop/dcw.html"}; char buf[1024]; HtmlParserCtxtPtr ctxt = htmlCreatePushParserCtxt(nullptr, nullptr, buf, sizeof(buf), "https://www.detectiveconanworld.com/wiki/Anime", XML_CHAR_ENCODING_UTF8); htmlCtxtUseOptions(ctxt, HTML_PARSE_RECOVER|HTML_PARSE_NOERROR|HTML_PARSE_NOWARNING); while (const DWORD cbRead = inf.Read(&buf, sizeof(buf))) { if (!htmlParseChunk(ctxt, buf, cbRead, 0)) throw std::runtime_error(xmlGetLastError()->message); } htmlParseChunk(ctxt, buf, 0, 1); /* Stop parsing. */ /* Find table rows containing episode data. */ HtmlDocPtr doc = ctxt->myDoc; XmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc); XmlXPathObjectPtr xpathObj = xmlXPathEvalExpression( reinterpret_cast("//tr[./td[1] != '' and ./td[3][@style='background:#f2fde9;']]"), xpathCtx); xmlNodeSetPtr nodes = xpathObj->nodesetval; int cNodes = nodes? nodes->nodeNr: 0; printf("%d nodes\n", cNodes); for (int i = 0; i < cNodes; i++) { extern FileView g_fvElv; extern FileView g_fvDlv; const xmlNodePtr node = nodes->nodeTab[i]; if (xmlChildElementCount(node) != 8) throw std::runtime_error("unexpected remote data format"); ElvDataA& e = g_fvElv.At(i); DlvDataA& d = g_fvDlv.At(i); /* Get cells. */ const xmlNodePtr nodeEp = xmlFirstElementChild(node); const xmlNodePtr nodeTitle = xmlNextElementSibling(xmlNextElementSibling(nodeEp)); const xmlNodePtr nodeDate = xmlNextElementSibling(nodeTitle); const xmlNodePtr nodeSource = xmlNextElementSibling( xmlNextElementSibling(xmlNextElementSibling(nodeDate))); const xmlNodePtr nodeHint = xmlNextElementSibling(nodeSource); WcharsFromXmlchars(e.siEp, xmlNodeGetContent(nodeEp)); e.siEp[wcscspn(e.siEp, L"W")] = 0; /* Remove WPS suffix. */ WcharsFromXmlchars(e.title, xmlNodeGetContent(nodeTitle)); WcharsFromXmlchars(d.date, xmlNodeGetContent(nodeDate)); WcharsFromXmlchars(d.source, xmlNodeGetContent(nodeSource)); WcharsFromXmlchars(d.hint, xmlNodeGetContent(nodeHint)); /* Get wiki URL. */ const xmlNodePtr nodeLink = xmlFirstElementChild(nodeTitle); if (nodeLink) WcharsFromXmlchars(d.wiki, xmlGetProp(nodeLink, (const xmlChar*)"href")); } *bDone = true; }