From 593757d3d178e7f94376241a70ac29fe2bf94469 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?John=20Ankarstr=C3=B6m?= Date: Sun, 21 Aug 2022 00:47:54 +0200 Subject: Rework remote data fetching. --- c/data.cpp | 152 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 152 insertions(+) (limited to 'c/data.cpp') diff --git a/c/data.cpp b/c/data.cpp index 9dd6ef8..1e59b8b 100644 --- a/c/data.cpp +++ b/c/data.cpp @@ -1 +1,153 @@ +#include +#include +#include +#include +#include + #include "data.h" + +struct InternetFile +{ + InternetFile(const wchar_t* url); + ~InternetFile(); + DWORD Read(void* buf, DWORD cb); + HINTERNET hi; + HINTERNET hiUrl; +}; + +InternetFile::InternetFile(const wchar_t* url) +{ + hi = InternetOpen(L"Episode Browser", + INTERNET_OPEN_TYPE_DIRECT, nullptr, nullptr, + /*INTERNET_FLAG_ASYNC*/0); + if (!hi) + throw Win32Error{}; + + hiUrl = InternetOpenUrl(hi, url, + nullptr, 0, INTERNET_FLAG_NO_UI, 0); + if (!hiUrl) { + InternetCloseHandle(hi); + throw Win32Error{}; + } +} + +InternetFile::~InternetFile() +{ + InternetCloseHandle(hiUrl); + InternetCloseHandle(hi); +} + +DWORD InternetFile::Read(void* buf, DWORD cb) +{ + DWORD cbRead; + if (InternetReadFile(hiUrl, buf, cb, &cbRead)) + return cbRead; + else + throw Win32Error{}; +} + +template +struct XmlPtr +{ + XmlPtr(T v) : v(v) + { + if (!v) + throw std::runtime_error(xmlGetLastError()->message); + } + ~XmlPtr() { F(v); } + operator T() { return v; } + T operator ->() { return v; } +private: + T v; +}; + +using HtmlParserCtxtPtr = XmlPtr; +using HtmlDocPtr = XmlPtr; +using XmlXPathContextPtr = XmlPtr; +using XmlXPathObjectPtr = XmlPtr; +static inline void XmlFree(void* p) { xmlFree(p); } +using XmlCharPtr = XmlPtr; + +template +bool WriteNodeContent(wchar_t (&dst)[N], const xmlNodePtr node) +{ + XmlCharPtr utf8 = xmlNodeGetContent(node); + if (!utf8) + return false; + + /* Truncate if source is larger than destination. */ + utf8[N] = 0; + int lenUtf8 = xmlStrlen(utf8); + + /* Convert internal representation from UTF-8 to Latin-1, + * which seems to actually convert the string to proper UTF-8 + * (???). */ + unsigned char lat1[N]; + int lenLat1 = N-1; + if (UTF8Toisolat1(lat1, &lenLat1, utf8, &lenUtf8) <= 0) + return false; + lat1[lenLat1] = 0; + + /* Write wide string to destination, if it fits. */ + char* const src = reinterpret_cast(lat1); + const int cchNarrow = lenLat1+1; + const int cchWide = MultiByteToWideChar(CP_UTF8, 0, src, cchNarrow, nullptr, 0); + if (static_cast(cchWide) > N) + return false; + return MultiByteToWideChar(CP_UTF8, 0, src, cchNarrow, dst, cchWide); +} + +void FetchData(FileView& fvElv, FileView& fvDlv) +{ + LIBXML_TEST_VERSION; + + //InternetFile inf{L"https://www.detectiveconanworld.com/wiki/Anime"}; + InternetFile inf{L"file://C:/Users/John/Desktop/dcw.html"}; + char buf[1024]; + + HtmlParserCtxtPtr ctxt = htmlCreatePushParserCtxt(nullptr, nullptr, + buf, sizeof(buf), "https://www.detectiveconanworld.com/wiki/Anime", + XML_CHAR_ENCODING_UTF8); + + htmlCtxtUseOptions(ctxt, HTML_PARSE_RECOVER|HTML_PARSE_NOERROR|HTML_PARSE_NOWARNING); + + while (const DWORD cbRead = inf.Read(&buf, sizeof(buf))) { + if (!htmlParseChunk(ctxt, buf, cbRead, 0)) + throw std::runtime_error(xmlGetLastError()->message); + } + htmlParseChunk(ctxt, buf, 0, 1); /* Stop parsing. */ + + /* Find table rows containing episode data. */ + HtmlDocPtr doc = ctxt->myDoc; + XmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc); + XmlXPathObjectPtr xpathObj = xmlXPathEvalExpression( + reinterpret_cast("//tr[./td[1] != '' and ./td[3][@style='background:#f2fde9;']]"), + xpathCtx); + xmlNodeSetPtr nodes = xpathObj->nodesetval; + int cNodes = nodes? nodes->nodeNr: 0; + + printf("%d nodes\n", cNodes); + for (int i = 0; i < cNodes; i++) { + const xmlNodePtr node = nodes->nodeTab[i]; + if (xmlChildElementCount(node) != 8) + throw std::runtime_error("unexpected remote data format"); + + /* Get cells. */ + const xmlNodePtr nodeEp = xmlFirstElementChild(node); + const xmlNodePtr nodeTitle = xmlNextElementSibling(xmlNextElementSibling(nodeEp)); + const xmlNodePtr nodeDate = xmlNextElementSibling(nodeTitle); + const xmlNodePtr nodeSource = xmlNextElementSibling( + xmlNextElementSibling(xmlNextElementSibling(nodeDate))); + const xmlNodePtr nodeHint = xmlNextElementSibling(nodeSource); + + ElvDataA& e = fvElv.At(i); + DlvDataA& d = fvDlv.At(i); + + WriteNodeContent(e.siEp, nodeEp); + e.siEp[wcscspn(e.siEp, L"W")] = 0; /* Remove WPS suffix. */ + WriteNodeContent(e.title, nodeTitle); + WriteNodeContent(d.date, nodeDate); + WriteNodeContent(d.source, nodeSource); + WriteNodeContent(d.hint, nodeHint); + } +} -- cgit v1.2.3