From 593757d3d178e7f94376241a70ac29fe2bf94469 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?John=20Ankarstr=C3=B6m?= <john@ankarstrom.se>
Date: Sun, 21 Aug 2022 00:47:54 +0200
Subject: Rework remote data fetching.

---
 c/data.cpp | 152 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 c/test.cpp | 117 +++++------------------------------------------
 2 files changed, 163 insertions(+), 106 deletions(-)

(limited to 'c')

diff --git a/c/data.cpp b/c/data.cpp
index 9dd6ef8..1e59b8b 100644
--- a/c/data.cpp
+++ b/c/data.cpp
@@ -1 +1,153 @@
+#include <windows.h>
+#include <wininet.h>
+#include <libxml/HTMLparser.h>
+#include <libxml/HTMLtree.h>
+#include <libxml/xpath.h>
+
 #include "data.h"
+
+struct InternetFile /* Owns a WinINet session handle and the handle of one URL opened in it. */
+{
+	InternetFile(const wchar_t* url); /* Opens url; throws Win32Error on failure. */
+	InternetFile(InternetFile&&) = delete; /* Not movable; this also disables copying, which would close the handles twice. */
+	~InternetFile(); /* Closes both handles. */
+	DWORD Read(void* buf, DWORD cb); /* Reads up to cb bytes into buf; returns the number read. */
+	HINTERNET hi, hiUrl; /* Session handle; handle of the opened URL. */
+};
+
+/* Opens a WinINet session, then url within it; throws Win32Error if either step fails. */
+InternetFile::InternetFile(const wchar_t* url)
+{
+	hi = InternetOpen(L"Episode Browser", INTERNET_OPEN_TYPE_DIRECT,
+	    nullptr, nullptr, /*INTERNET_FLAG_ASYNC*/0);
+	if (!hi)
+		throw Win32Error{};
+	hiUrl = InternetOpenUrl(hi, url, nullptr, 0, INTERNET_FLAG_NO_UI, 0);
+	if (hiUrl)
+		return;
+	/* Presumably Win32Error{} reads the last-error code, so build it before the cleanup call. */
+	Win32Error err{};
+	InternetCloseHandle(hi);
+	throw err;
+}
+
+InternetFile::~InternetFile() /* Releases the handles opened by the constructor. */
+{
+	InternetCloseHandle(hiUrl); /* URL handle first: it was opened from hi. */
+	InternetCloseHandle(hi);
+}
+
+/* Reads up to cb bytes from the opened URL into buf and returns the number of bytes read. */
+DWORD InternetFile::Read(void* buf, DWORD cb)
+{
+	DWORD cbGot;
+	if (!InternetReadFile(hiUrl, buf, cb, &cbGot))
+		throw Win32Error{}; /* The API call reported failure. */
+	return cbGot;
+}
+
+template <auto F, typename T>
+struct XmlPtr /* Owns a libxml2 pointer of type T and releases it with F on destruction. */
+{
+	XmlPtr(T v) : v(v) { /* Throws std::runtime_error if v is null. */
+		const auto* const err = xmlGetLastError(); /* Null when no error was recorded. */
+		if (!v) throw std::runtime_error(err && err->message? err->message: "libxml2 error");
+	}
+	XmlPtr(XmlPtr&&) = delete; /* Not movable; this also disables copying, so F runs only once. */
+	~XmlPtr() { F(v); }
+	operator T() { return v; }
+	T operator ->() { return v; }
+private:
+	T v;
+};
+
+using HtmlParserCtxtPtr = XmlPtr<xmlFreeParserCtxt, htmlParserCtxtPtr>; /* Each alias pairs a libxml2 pointer type with its free function. */
+using HtmlDocPtr = XmlPtr<xmlFreeDoc, htmlDocPtr>;
+using XmlXPathContextPtr = XmlPtr<xmlXPathFreeContext, xmlXPathContextPtr>;
+using XmlXPathObjectPtr = XmlPtr<xmlXPathFreeObject, xmlXPathObjectPtr>;
+static inline void XmlFree(void* p) { xmlFree(p); } /* Plain function forwarding to xmlFree, so it can serve as XmlPtr's F. */
+using XmlCharPtr = XmlPtr<XmlFree, xmlChar*>;
+
+/* Converts node's text content into dst; returns false if it is missing, empty or unconvertible. */
+template <size_t N>
+bool WriteNodeContent(wchar_t (&dst)[N], const xmlNodePtr node)
+{
+	xmlChar* const raw = xmlNodeGetContent(node);
+	if (!raw)
+		return false; /* Checked before wrapping, because XmlCharPtr throws on null. */
+	XmlCharPtr utf8 = raw;
+	/* Use at most N source bytes, without writing past the end of a shorter string. */
+	const int lenSrc = xmlStrlen(utf8);
+	int lenUtf8 = static_cast<size_t>(lenSrc) > N? static_cast<int>(N): lenSrc;
+
+	/* Convert internal representation from UTF-8 to Latin-1, which seems
+	 * to actually convert the string to proper UTF-8 (???). */
+	unsigned char lat1[N];
+	int lenLat1 = N-1;
+	if (UTF8Toisolat1(lat1, &lenLat1, utf8, &lenUtf8) <= 0)
+		return false;
+	lat1[lenLat1] = 0;
+
+	/* Write wide string to destination, if it fits. */
+	char* const src = reinterpret_cast<char*>(lat1);
+	const int cchNarrow = lenLat1+1;
+	const int cchWide = MultiByteToWideChar(CP_UTF8, 0, src, cchNarrow, nullptr, 0);
+	if (static_cast<size_t>(cchWide) > N)
+		return false;
+	return MultiByteToWideChar(CP_UTF8, 0, src, cchNarrow, dst, cchWide);
+}
+
+/* Parses the HTML read through InternetFile and writes one record per matching table row to fvElv and fvDlv. */
+void FetchData(FileView<ElvDataA>& fvElv, FileView<DlvDataA>& fvDlv)
+{
+	LIBXML_TEST_VERSION;
+
+	//InternetFile inf{L"https://www.detectiveconanworld.com/wiki/Anime"};
+	InternetFile inf{L"file://C:/Users/John/Desktop/dcw.html"};
+	char buf[1024];
+	/* NOTE(review): buf is uninitialized here, yet it is passed as the initial chunk.
+	 * Passing nullptr/0 instead may alter encoding detection -- check WriteNodeContent. */
+	HtmlParserCtxtPtr ctxt = htmlCreatePushParserCtxt(nullptr, nullptr,
+	    buf, sizeof(buf), "https://www.detectiveconanworld.com/wiki/Anime",
+	    XML_CHAR_ENCODING_UTF8);
+	htmlCtxtUseOptions(ctxt, HTML_PARSE_RECOVER|HTML_PARSE_NOERROR|HTML_PARSE_NOWARNING);
+
+	/* htmlParseChunk returns 0 on success and an error code otherwise.  With recovery
+	 * enabled those codes are not treated as fatal; a null document is rejected below. */
+	while (const DWORD cbRead = inf.Read(buf, sizeof(buf)))
+		htmlParseChunk(ctxt, buf, cbRead, 0);
+	htmlParseChunk(ctxt, buf, 0, 1); /* Stop parsing. */
+
+	/* Find table rows containing episode data. */
+	HtmlDocPtr doc = ctxt->myDoc;
+	XmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc);
+	XmlXPathObjectPtr xpathObj = xmlXPathEvalExpression(
+	    reinterpret_cast<const xmlChar*>("//tr[./td[1] != '' and ./td[3][@style='background:#f2fde9;']]"),
+	    xpathCtx);
+	xmlNodeSetPtr nodes = xpathObj->nodesetval;
+	int cNodes = nodes? nodes->nodeNr: 0;
+
+	printf("%d nodes\n", cNodes);
+	for (int i = 0; i < cNodes; i++) {
+		const xmlNodePtr node = nodes->nodeTab[i];
+		if (xmlChildElementCount(node) != 8)
+			throw std::runtime_error("unexpected remote data format");
+
+		/* Pick the 1st, 3rd, 4th, 7th and 8th cells of the row. */
+		const xmlNodePtr nodeEp = xmlFirstElementChild(node);
+		const xmlNodePtr nodeTitle = xmlNextElementSibling(xmlNextElementSibling(nodeEp));
+		const xmlNodePtr nodeDate = xmlNextElementSibling(nodeTitle);
+		const xmlNodePtr nodeSource = xmlNextElementSibling(
+		    xmlNextElementSibling(xmlNextElementSibling(nodeDate)));
+		const xmlNodePtr nodeHint = xmlNextElementSibling(nodeSource);
+
+		ElvDataA& e = fvElv.At(i);
+		DlvDataA& d = fvDlv.At(i);
+		WriteNodeContent(e.siEp, nodeEp);
+		e.siEp[wcscspn(e.siEp, L"W")] = 0; /* Remove WPS suffix. */
+		WriteNodeContent(e.title, nodeTitle);
+		WriteNodeContent(d.date, nodeDate);
+		WriteNodeContent(d.source, nodeSource);
+		WriteNodeContent(d.hint, nodeHint);
+	}
+}
diff --git a/c/test.cpp b/c/test.cpp
index b125a20..5bd8958 100644
--- a/c/test.cpp
+++ b/c/test.cpp
@@ -1,8 +1,4 @@
 #include <windows.h>
-#include <wininet.h>
-#include <libxml/HTMLparser.h>
-#include <libxml/HTMLtree.h>
-#include <libxml/xpath.h>
 
 #include "data.h"
 #include "episodelistview.h"
@@ -25,56 +21,6 @@ struct Test
 #define TEST(id) }; struct id : public Test { id() : Test(#id)
 #define FAIL(...) do { Sprintf(error, __VA_ARGS__); return; } while (0)
 
-template <typename F>
-struct Defer
-{
-	Defer(F dtor) : dtor(dtor) {}
-	~Defer() { dtor(); }
-	F dtor;
-};
-
-#define DEFER(x) Defer APPLY(CAT, defer_, __COUNTER__) {[=](){x;}}
-
-struct InternetFile
-{
-	InternetFile(const wchar_t* url);
-	~InternetFile();
-	DWORD Read(void* buf, DWORD cb);
-	HINTERNET hi;
-	HINTERNET hiUrl;
-};
-
-InternetFile::InternetFile(const wchar_t* url)
-{
-	hi = InternetOpen(L"Episode Browser",
-	    INTERNET_OPEN_TYPE_DIRECT, nullptr, nullptr,
-	    /*INTERNET_FLAG_ASYNC*/0);
-	if (!hi)
-		throw Win32Error{};
-
-	hiUrl = InternetOpenUrl(hi, url,
-	    nullptr, 0, INTERNET_FLAG_NO_UI, 0);
-	if (!hiUrl) {
-		InternetCloseHandle(hi);
-		throw Win32Error{};
-	}
-}
-
-InternetFile::~InternetFile()
-{
-	InternetCloseHandle(hiUrl);
-	InternetCloseHandle(hi);
-}
-
-DWORD InternetFile::Read(void* buf, DWORD cb)
-{
-	DWORD cbRead;
-	if (InternetReadFile(hiUrl, buf, cb, &cbRead))
-		return cbRead;
-	else
-		throw Win32Error{};
-}
-
 TESTS
 {
 	TEST(StrcpyWithSmallerDestination)
@@ -252,7 +198,7 @@ TESTS
 		//DeleteFile(L"tmp.dat");
 	}
 
-	TEST(DownloadData)
+	TEST(DownloadDataViaProlog)
 	{
 		WcharPtr title, wiki, date, source, hint;
 		int i = 1053;
@@ -276,60 +222,21 @@ TESTS
 		g_cfg.cEp = i;
 	}
 
-	TEST(XML)
+	TEST(Fetch)
 	{
-		LIBXML_TEST_VERSION;
-
-		InternetFile inf{L"https://www.detectiveconanworld.com/wiki/Anime"};
-		char buf[1024];
-
-		htmlParserCtxtPtr ctxt = htmlCreatePushParserCtxt(nullptr, nullptr,
-		    buf, sizeof(buf), "https://www.detectiveconanworld.com/wiki/Anime",
-		    XML_CHAR_ENCODING_UTF8);
-		DEFER(xmlFreeParserCtxt(ctxt));
-
-		htmlCtxtUseOptions(ctxt, HTML_PARSE_RECOVER|HTML_PARSE_NOERROR|HTML_PARSE_NOWARNING);
-
-		while (DWORD cbRead = inf.Read(&buf, sizeof(buf))) {
-			if (!htmlParseChunk(ctxt, buf, cbRead, 0))
-				FAIL(xmlGetLastError()->message);
-		}
-		htmlParseChunk(ctxt, buf, 0, 1); /* Terminate. */
-
-		htmlDocPtr doc = ctxt->myDoc;
-		if (!doc)
-			FAIL(xmlGetLastError()->message);
-		DEFER(xmlFreeDoc(doc)); /* Needed? */
-
-		xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc);
-		if (!xpathCtx)
-			FAIL(xmlGetLastError()->message);
-		DEFER(xmlXPathFreeContext(xpathCtx));
-
-		xmlXPathObjectPtr xpathObj = xmlXPathEvalExpression(
-		    reinterpret_cast<const xmlChar*>("//tr/td[@style='background:#f2fde9;']"),
-		    xpathCtx);
-		if (!xpathObj)
-			FAIL(xmlGetLastError()->message);
-		DEFER(xmlXPathFreeObject(xpathObj));
-
-		xmlNodeSetPtr nodes;
-		int cNodes;
-		nodes = xpathObj->nodesetval;
-		cNodes = nodes? nodes->nodeNr: 0;
-
-		printf("%d nodes\n", cNodes);
-		// for (int i = 0; i < cNodes; i++) {
-		// 	xmlNodePtr node = nodes->nodeTab[i];
-		// 	printf("node \"%s\": type %d\n", node->name, node->type);
-		// }
+		extern FileView<ElvDataA> g_fvElv; /* The global views and FetchData are declared locally here rather than via a header. */
+		extern FileView<DlvDataA> g_fvDlv;
+		void FetchData(FileView<ElvDataA>& fvElv, FileView<DlvDataA>& fvDlv); /* Function under test. */
+		//FileView<ElvDataA> fvElv{L"testelv.dat", 1080};
+		//FileView<DlvDataA> fvDlv{L"testdlv.dat", 1080};
+		FetchData(g_fvElv, g_fvDlv); /* Runs against the global views; the scratch-file variant above is disabled. */
 	}
 };
 
 int RunTests()
 {
 	const Test tests[] = {
-		//StrcpyWithSmallerDestination{},
+		StrcpyWithSmallerDestination{},
 		//EpisodeDataFromWeb{},
 		//EpisodeDataFromProlog{},
 		//IO{},
@@ -337,10 +244,8 @@ int RunTests()
 		//SampleConfigurationToDisk{},
 		//MigrateCfg{}
 		//MigrateDlvDataFromPrologToDisk{},
-		//DownloadData{},
-		XML{},
-		//ImportElvData{},
-		//ImportDlvData{},
+		//DownloadDataViaProlog{},
+		Fetch{},
 	};
 
 	printf("Results (%llu tests):\n", sizeof(tests)/sizeof(*tests));
-- 
cgit v1.2.3