c/data.cpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154

#include <windows.h>
#include <wininet.h>
#include <libxml/HTMLparser.h>
#include <libxml/HTMLtree.h>
#include <libxml/xpath.h>

#include "data.h"
#include "win.h"

/* Exception carrying the message of the most recent libxml2 error.
 *
 * The message is looked up with xmlGetLastError() at construction
 * time. libxml2 returns a null pointer from that function when no
 * error has been recorded, so a generic message is substituted in
 * that case instead of dereferencing null.
 *
 * NOTE(review): msg points into storage owned by libxml2; it is
 * presumably only valid until the last error is reset or replaced --
 * confirm if the exception is ever kept around for long. */
struct XmlError : public std::exception
{
	const char* msg;
	XmlError()
	{
		/* `auto` because the pointee is const-qualified in some
		 * libxml2 versions and not in others. */
		const auto* err = xmlGetLastError();
		msg = (err && err->message)? err->message: "unknown libxml2 error";
	}
	/* Returns the message captured by the constructor. */
	const char* what() const noexcept override
	{
		return msg;
	}
};

/* Owning wrappers around libxml2 pointers. Each alias pairs a pointer
 * type with the libxml2 function that releases it and with XmlError.
 * Managed is declared elsewhere in the project; presumably it calls
 * the release function on destruction and throws the given error type
 * for a null value -- confirm against its definition. */
using XmlXPathContextPtr = Managed<xmlXPathContextPtr, xmlXPathFreeContext, XmlError>;
using XmlXPathObjectPtr = Managed<xmlXPathObjectPtr, xmlXPathFreeObject, XmlError>;
/* Forwards to xmlFree. Presumably needed because xmlFree itself is not
 * an ordinary function in libxml2 and thus cannot be used directly as
 * the template argument below. */
static inline void XmlFree(void* p) { xmlFree(p); }
using XmlCharPtr = Managed<xmlChar*, XmlFree, XmlError>;

/* WinINet handles are released with InternetCloseHandle. */
using InternetHandle = Managed<HINTERNET, InternetCloseHandle, InternetError>;
/* File-wide WinINet session opened during static initialization, with
 * "Episode Browser" as the agent string and a direct (non-proxy)
 * connection type. */
static InternetHandle s_hi = InternetOpen(L"Episode Browser", INTERNET_OPEN_TYPE_DIRECT, nullptr, nullptr, 0);

struct ParsedDoc
{
	using HtmlParserCtxtPtr = Managed<htmlParserCtxtPtr, xmlFreeParserCtxt, XmlError>;
	using HtmlDocPtr = Managed<htmlDocPtr, xmlFreeDoc, XmlError>;

	InternetHandle hiUrl;
	HtmlParserCtxtPtr ctxt;
	char bufI[1024];
	char bufX[1024];

	ParsedDoc(const wchar_t* wszUrl, const char* szUrl)
		: hiUrl(InternetOpenUrl(s_hi, wszUrl, nullptr, 0, INTERNET_FLAG_NO_UI, 0)),
		  ctxt(htmlCreatePushParserCtxt(nullptr, nullptr, bufX, sizeof(bufX), szUrl, XML_CHAR_ENCODING_UTF8))
	{
		htmlCtxtUseOptions(ctxt, HTML_PARSE_RECOVER|HTML_PARSE_NOERROR|HTML_PARSE_NOWARNING);

		BOOL r;
		DWORD cbRead;
		while (r = InternetReadFile(hiUrl, bufI, sizeof(bufI), &cbRead), cbRead) {
			if (!r)
				throw InternetError();
			if (!htmlParseChunk(ctxt, bufI, cbRead, 0))
				throw XmlError();
		}
		htmlParseChunk(ctxt, bufI, 0, 1); /* Stop parsing. */
	}

	operator htmlDocPtr() { return ctxt->myDoc; }
};

/* Convert a libxml2 string to a null-terminated wide string in dst.
 *
 * The source is taken by value and released when the function
 * returns. At most N-1 bytes of converted text are used; longer
 * input is truncated. An empty source yields an empty string in dst.
 *
 * Returns true if dst was written, false if the conversion failed,
 * in which case dst is left untouched. */
template <size_t N>
bool WcharsFromXmlchars(wchar_t (&dst)[N], XmlCharPtr utf8) noexcept
{
	static_assert(N > 0, "destination must have room for the terminator");

	/* Convert internal representation from UTF-8 to Latin-1,
	 * which seems to actually convert the string to proper UTF-8
	 * (???).
	 * NOTE(review): this suggests that the parser hands back
	 * doubly encoded text -- unconfirmed.
	 *
	 * The output capacity alone limits how much is converted, so
	 * the source is neither modified nor cut in the middle of a
	 * multi-byte sequence. A return value of zero merely means
	 * that the source was empty; only negative values are errors. */
	unsigned char lat1[N];
	int lenUtf8 = xmlStrlen(utf8);
	int lenLat1 = static_cast<int>(N)-1;
	if (UTF8Toisolat1(lat1, &lenLat1, utf8, &lenUtf8) < 0)
		return false;
	lat1[lenLat1] = 0;

	/* Write wide string to destination, if it fits. The lengths
	 * include the terminator, so it is converted as well. */
	char* const src = reinterpret_cast<char*>(lat1);
	const int cchNarrow = lenLat1+1;
	const int cchWide = MultiByteToWideChar(CP_UTF8, 0, src, cchNarrow, nullptr, 0);
	if (cchWide <= 0 || static_cast<size_t>(cchWide) > N)
		return false;
	return MultiByteToWideChar(CP_UTF8, 0, src, cchNarrow, dst, cchWide) != 0;
}

void FetchData()
{
	/* The remote data is retrieved using WinINet from the
	 * Detective Conan World wiki. Using libxml2's "push parser",
	 * the HTML is parsed piece by piece as it is retrieved. The
	 * episode data are contained in table rows matching a (very!)
	 * specific XPath query. This is fragile theoretically, but
	 * unlikely to break practically. */

	ParsedDoc doc(L"https://www.detectiveconanworld.com/wiki/Anime",
	    "https://www.detectiveconanworld.com/wiki/Anime");
	XmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc);
	XmlXPathObjectPtr xpathObj = xmlXPathEvalExpression(
	    reinterpret_cast<const xmlChar*>("//tr[./td[1] != '' and ./td[3][@style='background:#f2fde9;']]"),
	    xpathCtx);
	xmlNodeSetPtr nodes = xpathObj->nodesetval;
	int cNodes = nodes? nodes->nodeNr: 0;

	if (!cNodes)
		throw std::runtime_error("could not find remote episode information");

	for (int i = 0; i < cNodes; i++) {
		extern FileView<ElvDataA> g_fvElv;
		extern FileView<DlvDataA> g_fvDlv;

		const xmlNodePtr node = nodes->nodeTab[i];
		if (xmlChildElementCount(node) != 8)
			throw std::runtime_error("unexpected remote data format");

		ElvDataA& e = g_fvElv.At(i);
		DlvDataA& d = g_fvDlv.At(i);

		/* Each datum is contained within a specific cell in
		 * the row. The child element count above ensures that
		 * none of the following nodes are null. */
		const xmlNodePtr nodeEp = xmlFirstElementChild(node);
		const xmlNodePtr nodeTitle = xmlNextElementSibling(xmlNextElementSibling(nodeEp));
		const xmlNodePtr nodeDate = xmlNextElementSibling(nodeTitle);
		const xmlNodePtr nodeSource = xmlNextElementSibling(
		    xmlNextElementSibling(xmlNextElementSibling(nodeDate)));
		const xmlNodePtr nodeHint = xmlNextElementSibling(nodeSource);

		WcharsFromXmlchars(d.date, xmlNodeGetContent(nodeDate));
		WcharsFromXmlchars(d.source, xmlNodeGetContent(nodeSource));
		WcharsFromXmlchars(d.hint, xmlNodeGetContent(nodeHint));
		e.bTVOriginal = wcsncmp(d.source, L"TV", 2) == 0? 1: 0;
		WcharsFromXmlchars(e.siEp, xmlNodeGetContent(nodeEp));
		e.siEp[wcscspn(e.siEp, L"W")] = 0; /* Remove potential "WPS" suffix. */
		WcharsFromXmlchars(e.title, xmlNodeGetContent(nodeTitle));

		/* Retrieve the link to the episode's wiki entry,
		 * which should be the first (and only) child element
		 * of the title node. */
		const xmlNodePtr nodeLink = xmlFirstElementChild(nodeTitle);
		if (nodeLink)
			WcharsFromXmlchars(d.wiki, xmlGetProp(nodeLink, reinterpret_cast<const xmlChar*>("href")));
	}
}

/* Run FetchData and set *bDone to true once it has finished, whether
 * it succeeded or failed. On failure, the flag is set before the
 * error is reported through ShowException. */
void WaitFetchData(bool* bDone) noexcept
{
	try {
		FetchData();
	} catch (...) {
		*bDone = true;
		ShowException(L"Remote data could not be fetched due to an error: %s", L"Error", MB_ICONWARNING);
		return;
	}
	*bDone = true;
}