From 593757d3d178e7f94376241a70ac29fe2bf94469 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?John=20Ankarstr=C3=B6m?= Date: Sun, 21 Aug 2022 00:47:54 +0200 Subject: Rework remote data fetching. --- c/data.cpp | 152 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ c/test.cpp | 117 +++++------------------------------------------ 2 files changed, 163 insertions(+), 106 deletions(-) (limited to 'c') diff --git a/c/data.cpp b/c/data.cpp index 9dd6ef8..1e59b8b 100644 --- a/c/data.cpp +++ b/c/data.cpp @@ -1 +1,153 @@ +#include +#include +#include +#include +#include + #include "data.h" + +struct InternetFile +{ + InternetFile(const wchar_t* url); + ~InternetFile(); + DWORD Read(void* buf, DWORD cb); + HINTERNET hi; + HINTERNET hiUrl; +}; + +InternetFile::InternetFile(const wchar_t* url) +{ + hi = InternetOpen(L"Episode Browser", + INTERNET_OPEN_TYPE_DIRECT, nullptr, nullptr, + /*INTERNET_FLAG_ASYNC*/0); + if (!hi) + throw Win32Error{}; + + hiUrl = InternetOpenUrl(hi, url, + nullptr, 0, INTERNET_FLAG_NO_UI, 0); + if (!hiUrl) { + InternetCloseHandle(hi); + throw Win32Error{}; + } +} + +InternetFile::~InternetFile() +{ + InternetCloseHandle(hiUrl); + InternetCloseHandle(hi); +} + +DWORD InternetFile::Read(void* buf, DWORD cb) +{ + DWORD cbRead; + if (InternetReadFile(hiUrl, buf, cb, &cbRead)) + return cbRead; + else + throw Win32Error{}; +} + +template +struct XmlPtr +{ + XmlPtr(T v) : v(v) + { + if (!v) + throw std::runtime_error(xmlGetLastError()->message); + } + ~XmlPtr() { F(v); } + operator T() { return v; } + T operator ->() { return v; } +private: + T v; +}; + +using HtmlParserCtxtPtr = XmlPtr; +using HtmlDocPtr = XmlPtr; +using XmlXPathContextPtr = XmlPtr; +using XmlXPathObjectPtr = XmlPtr; +static inline void XmlFree(void* p) { xmlFree(p); } +using XmlCharPtr = XmlPtr; + +template +bool WriteNodeContent(wchar_t (&dst)[N], const xmlNodePtr node) +{ + XmlCharPtr utf8 = xmlNodeGetContent(node); + if (!utf8) + return false; + + /* Truncate if source is larger than destination. */ + utf8[N] = 0; + int lenUtf8 = xmlStrlen(utf8); + + /* Convert internal representation from UTF-8 to Latin-1, + * which seems to actually convert the string to proper UTF-8 + * (???). */ + unsigned char lat1[N]; + int lenLat1 = N-1; + if (UTF8Toisolat1(lat1, &lenLat1, utf8, &lenUtf8) <= 0) + return false; + lat1[lenLat1] = 0; + + /* Write wide string to destination, if it fits. */ + char* const src = reinterpret_cast(lat1); + const int cchNarrow = lenLat1+1; + const int cchWide = MultiByteToWideChar(CP_UTF8, 0, src, cchNarrow, nullptr, 0); + if (static_cast(cchWide) > N) + return false; + return MultiByteToWideChar(CP_UTF8, 0, src, cchNarrow, dst, cchWide); +} + +void FetchData(FileView& fvElv, FileView& fvDlv) +{ + LIBXML_TEST_VERSION; + + //InternetFile inf{L"https://www.detectiveconanworld.com/wiki/Anime"}; + InternetFile inf{L"file://C:/Users/John/Desktop/dcw.html"}; + char buf[1024]; + + HtmlParserCtxtPtr ctxt = htmlCreatePushParserCtxt(nullptr, nullptr, + buf, sizeof(buf), "https://www.detectiveconanworld.com/wiki/Anime", + XML_CHAR_ENCODING_UTF8); + + htmlCtxtUseOptions(ctxt, HTML_PARSE_RECOVER|HTML_PARSE_NOERROR|HTML_PARSE_NOWARNING); + + while (const DWORD cbRead = inf.Read(&buf, sizeof(buf))) { + if (!htmlParseChunk(ctxt, buf, cbRead, 0)) + throw std::runtime_error(xmlGetLastError()->message); + } + htmlParseChunk(ctxt, buf, 0, 1); /* Stop parsing. */ + + /* Find table rows containing episode data. */ + HtmlDocPtr doc = ctxt->myDoc; + XmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc); + XmlXPathObjectPtr xpathObj = xmlXPathEvalExpression( + reinterpret_cast("//tr[./td[1] != '' and ./td[3][@style='background:#f2fde9;']]"), + xpathCtx); + xmlNodeSetPtr nodes = xpathObj->nodesetval; + int cNodes = nodes? nodes->nodeNr: 0; + + printf("%d nodes\n", cNodes); + for (int i = 0; i < cNodes; i++) { + const xmlNodePtr node = nodes->nodeTab[i]; + if (xmlChildElementCount(node) != 8) + throw std::runtime_error("unexpected remote data format"); + + /* Get cells. */ + const xmlNodePtr nodeEp = xmlFirstElementChild(node); + const xmlNodePtr nodeTitle = xmlNextElementSibling(xmlNextElementSibling(nodeEp)); + const xmlNodePtr nodeDate = xmlNextElementSibling(nodeTitle); + const xmlNodePtr nodeSource = xmlNextElementSibling( + xmlNextElementSibling(xmlNextElementSibling(nodeDate))); + const xmlNodePtr nodeHint = xmlNextElementSibling(nodeSource); + + ElvDataA& e = fvElv.At(i); + DlvDataA& d = fvDlv.At(i); + + WriteNodeContent(e.siEp, nodeEp); + e.siEp[wcscspn(e.siEp, L"W")] = 0; /* Remove WPS suffix. */ + WriteNodeContent(e.title, nodeTitle); + WriteNodeContent(d.date, nodeDate); + WriteNodeContent(d.source, nodeSource); + WriteNodeContent(d.hint, nodeHint); + } +} diff --git a/c/test.cpp b/c/test.cpp index b125a20..5bd8958 100644 --- a/c/test.cpp +++ b/c/test.cpp @@ -1,8 +1,4 @@ #include -#include -#include -#include -#include #include "data.h" #include "episodelistview.h" @@ -25,56 +21,6 @@ struct Test #define TEST(id) }; struct id : public Test { id() : Test(#id) #define FAIL(...) do { Sprintf(error, __VA_ARGS__); return; } while (0) -template -struct Defer -{ - Defer(F dtor) : dtor(dtor) {} - ~Defer() { dtor(); } - F dtor; -}; - -#define DEFER(x) Defer APPLY(CAT, defer_, __COUNTER__) {[=](){x;}} - -struct InternetFile -{ - InternetFile(const wchar_t* url); - ~InternetFile(); - DWORD Read(void* buf, DWORD cb); - HINTERNET hi; - HINTERNET hiUrl; -}; - -InternetFile::InternetFile(const wchar_t* url) -{ - hi = InternetOpen(L"Episode Browser", - INTERNET_OPEN_TYPE_DIRECT, nullptr, nullptr, - /*INTERNET_FLAG_ASYNC*/0); - if (!hi) - throw Win32Error{}; - - hiUrl = InternetOpenUrl(hi, url, - nullptr, 0, INTERNET_FLAG_NO_UI, 0); - if (!hiUrl) { - InternetCloseHandle(hi); - throw Win32Error{}; - } -} - -InternetFile::~InternetFile() -{ - InternetCloseHandle(hiUrl); - InternetCloseHandle(hi); -} - -DWORD InternetFile::Read(void* buf, DWORD cb) -{ - DWORD cbRead; - if (InternetReadFile(hiUrl, buf, cb, &cbRead)) - return cbRead; - else - throw Win32Error{}; -} - TESTS { TEST(StrcpyWithSmallerDestination) @@ -252,7 +198,7 @@ TESTS //DeleteFile(L"tmp.dat"); } - TEST(DownloadData) + TEST(DownloadDataViaProlog) { WcharPtr title, wiki, date, source, hint; int i = 1053; @@ -276,60 +222,21 @@ TESTS g_cfg.cEp = i; } - TEST(XML) + TEST(Fetch) { - LIBXML_TEST_VERSION; - - InternetFile inf{L"https://www.detectiveconanworld.com/wiki/Anime"}; - char buf[1024]; - - htmlParserCtxtPtr ctxt = htmlCreatePushParserCtxt(nullptr, nullptr, - buf, sizeof(buf), "https://www.detectiveconanworld.com/wiki/Anime", - XML_CHAR_ENCODING_UTF8); - DEFER(xmlFreeParserCtxt(ctxt)); - - htmlCtxtUseOptions(ctxt, HTML_PARSE_RECOVER|HTML_PARSE_NOERROR|HTML_PARSE_NOWARNING); - - while (DWORD cbRead = inf.Read(&buf, sizeof(buf))) { - if (!htmlParseChunk(ctxt, buf, cbRead, 0)) - FAIL(xmlGetLastError()->message); - } - htmlParseChunk(ctxt, buf, 0, 1); /* Terminate. */ - - htmlDocPtr doc = ctxt->myDoc; - if (!doc) - FAIL(xmlGetLastError()->message); - DEFER(xmlFreeDoc(doc)); /* Needed? */ - - xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc); - if (!xpathCtx) - FAIL(xmlGetLastError()->message); - DEFER(xmlXPathFreeContext(xpathCtx)); - - xmlXPathObjectPtr xpathObj = xmlXPathEvalExpression( - reinterpret_cast("//tr/td[@style='background:#f2fde9;']"), - xpathCtx); - if (!xpathObj) - FAIL(xmlGetLastError()->message); - DEFER(xmlXPathFreeObject(xpathObj)); - - xmlNodeSetPtr nodes; - int cNodes; - nodes = xpathObj->nodesetval; - cNodes = nodes? nodes->nodeNr: 0; - - printf("%d nodes\n", cNodes); - // for (int i = 0; i < cNodes; i++) { - // xmlNodePtr node = nodes->nodeTab[i]; - // printf("node \"%s\": type %d\n", node->name, node->type); - // } + extern FileView g_fvElv; + extern FileView g_fvDlv; + void FetchData(FileView& fvElv, FileView& fvDlv); + //FileView fvElv{L"testelv.dat", 1080}; + //FileView fvDlv{L"testdlv.dat", 1080}; + FetchData(g_fvElv, g_fvDlv); } }; int RunTests() { const Test tests[] = { - //StrcpyWithSmallerDestination{}, + StrcpyWithSmallerDestination{}, //EpisodeDataFromWeb{}, //EpisodeDataFromProlog{}, //IO{}, @@ -337,10 +244,8 @@ int RunTests() //SampleConfigurationToDisk{}, //MigrateCfg{} //MigrateDlvDataFromPrologToDisk{}, - //DownloadData{}, - XML{}, - //ImportElvData{}, - //ImportDlvData{}, + //DownloadDataViaProlog{}, + Fetch{}, }; printf("Results (%llu tests):\n", sizeof(tests)/sizeof(*tests)); -- cgit v1.2.3