#include #include #include #include #include #include "data.h" struct InternetFile { InternetFile(const wchar_t* url); ~InternetFile(); DWORD Read(void* buf, DWORD cb); HINTERNET hi; HINTERNET hiUrl; }; InternetFile::InternetFile(const wchar_t* url) { hi = InternetOpen(L"Episode Browser", INTERNET_OPEN_TYPE_DIRECT, nullptr, nullptr, /*INTERNET_FLAG_ASYNC*/0); if (!hi) throw Win32Error{}; hiUrl = InternetOpenUrl(hi, url, nullptr, 0, INTERNET_FLAG_NO_UI, 0); if (!hiUrl) { InternetCloseHandle(hi); throw Win32Error{}; } } InternetFile::~InternetFile() { InternetCloseHandle(hiUrl); InternetCloseHandle(hi); } DWORD InternetFile::Read(void* buf, DWORD cb) { DWORD cbRead; if (InternetReadFile(hiUrl, buf, cb, &cbRead)) return cbRead; else throw Win32Error{}; } template struct XmlPtr { XmlPtr(T v) : v(v) { if (!v) throw std::runtime_error(xmlGetLastError()->message); } ~XmlPtr() { F(v); } operator T() { return v; } T operator ->() { return v; } private: T v; }; using HtmlParserCtxtPtr = XmlPtr; using HtmlDocPtr = XmlPtr; using XmlXPathContextPtr = XmlPtr; using XmlXPathObjectPtr = XmlPtr; static inline void XmlFree(void* p) { xmlFree(p); } using XmlCharPtr = XmlPtr; template bool WriteNodeContent(wchar_t (&dst)[N], const xmlNodePtr node) { XmlCharPtr utf8 = xmlNodeGetContent(node); if (!utf8) return false; /* Truncate if source is larger than destination. */ utf8[N] = 0; int lenUtf8 = xmlStrlen(utf8); /* Convert internal representation from UTF-8 to Latin-1, * which seems to actually convert the string to proper UTF-8 * (???). */ unsigned char lat1[N]; int lenLat1 = N-1; if (UTF8Toisolat1(lat1, &lenLat1, utf8, &lenUtf8) <= 0) return false; lat1[lenLat1] = 0; /* Write wide string to destination, if it fits. */ char* const src = reinterpret_cast(lat1); const int cchNarrow = lenLat1+1; const int cchWide = MultiByteToWideChar(CP_UTF8, 0, src, cchNarrow, nullptr, 0); if (static_cast(cchWide) > N) return false; return MultiByteToWideChar(CP_UTF8, 0, src, cchNarrow, dst, cchWide); } void FetchData(FileView& fvElv, FileView& fvDlv) { LIBXML_TEST_VERSION; //InternetFile inf{L"https://www.detectiveconanworld.com/wiki/Anime"}; InternetFile inf{L"file://C:/Users/John/Desktop/dcw.html"}; char buf[1024]; HtmlParserCtxtPtr ctxt = htmlCreatePushParserCtxt(nullptr, nullptr, buf, sizeof(buf), "https://www.detectiveconanworld.com/wiki/Anime", XML_CHAR_ENCODING_UTF8); htmlCtxtUseOptions(ctxt, HTML_PARSE_RECOVER|HTML_PARSE_NOERROR|HTML_PARSE_NOWARNING); while (const DWORD cbRead = inf.Read(&buf, sizeof(buf))) { if (!htmlParseChunk(ctxt, buf, cbRead, 0)) throw std::runtime_error(xmlGetLastError()->message); } htmlParseChunk(ctxt, buf, 0, 1); /* Stop parsing. */ /* Find table rows containing episode data. */ HtmlDocPtr doc = ctxt->myDoc; XmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc); XmlXPathObjectPtr xpathObj = xmlXPathEvalExpression( reinterpret_cast("//tr[./td[1] != '' and ./td[3][@style='background:#f2fde9;']]"), xpathCtx); xmlNodeSetPtr nodes = xpathObj->nodesetval; int cNodes = nodes? nodes->nodeNr: 0; printf("%d nodes\n", cNodes); for (int i = 0; i < cNodes; i++) { const xmlNodePtr node = nodes->nodeTab[i]; if (xmlChildElementCount(node) != 8) throw std::runtime_error("unexpected remote data format"); /* Get cells. */ const xmlNodePtr nodeEp = xmlFirstElementChild(node); const xmlNodePtr nodeTitle = xmlNextElementSibling(xmlNextElementSibling(nodeEp)); const xmlNodePtr nodeDate = xmlNextElementSibling(nodeTitle); const xmlNodePtr nodeSource = xmlNextElementSibling( xmlNextElementSibling(xmlNextElementSibling(nodeDate))); const xmlNodePtr nodeHint = xmlNextElementSibling(nodeSource); ElvDataA& e = fvElv.At(i); DlvDataA& d = fvDlv.At(i); WriteNodeContent(e.siEp, nodeEp); e.siEp[wcscspn(e.siEp, L"W")] = 0; /* Remove WPS suffix. */ WriteNodeContent(e.title, nodeTitle); WriteNodeContent(d.date, nodeDate); WriteNodeContent(d.source, nodeSource); WriteNodeContent(d.hint, nodeHint); } }