aboutsummaryrefslogtreecommitdiff
path: root/c/data.cpp
blob: 4bded44c2b5d0eefc0005c649d135ce4dd94c404 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#include <algorithm>
#include <windows.h>
#include <wininet.h>
#include <libxml/HTMLparser.h>
#include <libxml/HTMLtree.h>
#include <libxml/xpath.h>

#include "data.h"

/* A remote file opened for reading through WinINet.
 *
 * The constructor opens a session handle (hi) and a URL handle
 * (hiUrl); the destructor closes both. */
struct InternetFile
{
	/* Opens url. Throws Win32Error if either handle cannot be
	 * opened. */
	InternetFile(const wchar_t* url);
	~InternetFile();

	/* Both handles are closed by the destructor, so a copy would
	 * close them a second time. Copying is therefore disabled. */
	InternetFile(const InternetFile&) = delete;
	InternetFile& operator =(const InternetFile&) = delete;

	/* Reads up to cb bytes into buf and returns the number of bytes
	 * actually read. Throws Win32Error on failure. */
	DWORD Read(void* buf, DWORD cb);

	HINTERNET hi;    /* Session handle from InternetOpen. */
	HINTERNET hiUrl; /* URL handle from InternetOpenUrl. */
};

/* Opens a WinINet session with a direct connection and then opens
 * url within it.
 *
 * Throws Win32Error if either InternetOpen or InternetOpenUrl
 * fails. No handle is left open when an exception is thrown. */
InternetFile::InternetFile(const wchar_t* url)
{
	hi = InternetOpen(L"Episode Browser",
	    INTERNET_OPEN_TYPE_DIRECT, nullptr, nullptr,
	    /*INTERNET_FLAG_ASYNC*/0);
	if (!hi)
		throw Win32Error{};

	hiUrl = InternetOpenUrl(hi, url,
	    nullptr, 0, INTERNET_FLAG_NO_UI, 0);
	if (!hiUrl) {
		/* The destructor does not run when the constructor
		 * throws, so the session handle is released here.
		 * InternetCloseHandle may overwrite the thread's
		 * last-error value; save and restore it so that the
		 * InternetOpenUrl failure code is still current when
		 * the exception is constructed.
		 * NOTE(review): presumably Win32Error{} reads
		 * GetLastError() -- confirm against its definition. */
		const DWORD err = GetLastError();
		InternetCloseHandle(hi);
		SetLastError(err);
		throw Win32Error{};
	}
}

/* Closes the URL handle first, then the session handle it was
 * opened from. */
InternetFile::~InternetFile()
{
	const HINTERNET handles[] = {hiUrl, hi};
	for (const HINTERNET h : handles)
		InternetCloseHandle(h);
}

/* Reads up to cb bytes from the opened URL into buf.
 *
 * Returns the number of bytes stored in buf as reported by
 * InternetReadFile. Throws Win32Error if the read fails. */
DWORD InternetFile::Read(void* buf, DWORD cb)
{
	DWORD cbRead;
	const BOOL ok = InternetReadFile(hiUrl, buf, cb, &cbRead);
	if (!ok)
		throw Win32Error{};
	return cbRead;
}

/* Owning wrapper around a libxml2 pointer.
 *
 * F  function called with the pointer when the wrapper is destroyed.
 * T  pointer type being wrapped.
 *
 * Construction from a null pointer throws std::runtime_error, so a
 * live wrapper obtained from the converting constructor always holds
 * a non-null pointer. The wrapper converts implicitly to T and
 * supports -> so it can be passed straight to libxml2 functions. */
template <auto F, typename T>
struct XmlPtr
{
	XmlPtr(T v) : v(v)
	{
		if (!v) {
			/* libxml2 does not record an error for every null
			 * result, so the last error may itself be null. */
			const auto* err = xmlGetLastError();
			throw std::runtime_error(err && err->message
			    ? err->message
			    : "libxml2 returned a null pointer");
		}
	}

	/* The destructor releases the pointer, so a copy would release
	 * it twice. Ownership can only be moved. */
	XmlPtr(const XmlPtr&) = delete;
	XmlPtr& operator =(const XmlPtr&) = delete;
	XmlPtr(XmlPtr&& other) noexcept : v(other.v) { other.v = nullptr; }

	/* A moved-from wrapper holds null and releases nothing. */
	~XmlPtr() { if (v) F(v); }

	operator T() { return v; }
	T operator ->() { return v; }
private:
	T v;
};

/* Plain function forwarding to xmlFree, so that it can be named as
 * the release function of XmlCharPtr below. */
static inline void XmlFree(void* p)
{
	xmlFree(p);
}

/* Owning handles for the libxml2 objects used in this file, each
 * paired with the function that releases it. */
using HtmlParserCtxtPtr  = XmlPtr<xmlFreeParserCtxt, htmlParserCtxtPtr>;
using HtmlDocPtr         = XmlPtr<xmlFreeDoc, htmlDocPtr>;
using XmlXPathContextPtr = XmlPtr<xmlXPathFreeContext, xmlXPathContextPtr>;
using XmlXPathObjectPtr  = XmlPtr<xmlXPathFreeObject, xmlXPathObjectPtr>;
using XmlCharPtr         = XmlPtr<XmlFree, xmlChar*>;

/* Converts a libxml2 string to a NUL-terminated wide string.
 *
 * dst   destination array of N wide characters.
 * utf8  source string; owned by the XmlCharPtr parameter and
 *       released when that parameter is destroyed. The buffer is
 *       modified in place if it has to be truncated.
 *
 * Sources longer than N-1 bytes are truncated. An empty source
 * yields an empty string in dst.
 *
 * Returns true if dst was written. Returns false, leaving dst
 * unchanged, if the text cannot be represented in Latin-1, or if
 * the wide conversion fails or does not fit. */
template <size_t N>
bool WcharsFromXmlchars(wchar_t (&dst)[N], XmlCharPtr utf8)
{
	xmlChar* const text = utf8;

	/* Truncate if source is larger than destination. The Latin-1
	 * buffer below holds N-1 bytes plus a terminator and never
	 * needs more bytes than the source has, so N-1 source bytes
	 * always fit. Cut on a character boundary, never between a
	 * lead byte and its continuation bytes, and carry the shortened
	 * length forward so the converter stops at the new end. */
	const int cbMax = static_cast<int>(N) - 1;
	int lenUtf8 = xmlStrlen(text);
	if (lenUtf8 > cbMax) {
		lenUtf8 = cbMax;
		while (lenUtf8 > 0 && (text[lenUtf8] & 0xC0) == 0x80)
			lenUtf8--;
		text[lenUtf8] = 0;
	}

	/* Convert internal representation from UTF-8 to Latin-1,
	 * which seems to actually convert the string to proper UTF-8
	 * (???). A negative result is an error; zero just means the
	 * source was empty. */
	unsigned char lat1[N];
	int lenLat1 = cbMax;
	if (UTF8Toisolat1(lat1, &lenLat1, text, &lenUtf8) < 0)
		return false;
	lat1[lenLat1] = 0;

	/* Write wide string to destination, if it fits. The terminator
	 * is included in the converted range, so dst ends up
	 * NUL-terminated. */
	char* const src = reinterpret_cast<char*>(lat1);
	const int cchNarrow = lenLat1+1;
	const int cchWide = MultiByteToWideChar(CP_UTF8, 0, src, cchNarrow, nullptr, 0);
	if (static_cast<size_t>(cchWide) > N)
		return false;
	return MultiByteToWideChar(CP_UTF8, 0, src, cchNarrow, dst, cchWide) != 0;
}

void FetchData(bool* bDone)
{
	LIBXML_TEST_VERSION;

	InternetFile inf{L"https://www.detectiveconanworld.com/wiki/Anime"};
	//InternetFile inf{L"file://C:/Users/John/Desktop/dcw.html"};
	char buf[1024];

	HtmlParserCtxtPtr ctxt = htmlCreatePushParserCtxt(nullptr, nullptr,
	    buf, sizeof(buf), "https://www.detectiveconanworld.com/wiki/Anime",
	    XML_CHAR_ENCODING_UTF8);

	htmlCtxtUseOptions(ctxt, HTML_PARSE_RECOVER|HTML_PARSE_NOERROR|HTML_PARSE_NOWARNING);

	while (const DWORD cbRead = inf.Read(&buf, sizeof(buf))) {
		if (!htmlParseChunk(ctxt, buf, cbRead, 0))
			throw std::runtime_error(xmlGetLastError()->message);
	}
	htmlParseChunk(ctxt, buf, 0, 1); /* Stop parsing. */

	/* Find table rows containing episode data. */
	HtmlDocPtr doc = ctxt->myDoc;
	XmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc);
	XmlXPathObjectPtr xpathObj = xmlXPathEvalExpression(
	    reinterpret_cast<const xmlChar*>("//tr[./td[1] != '' and ./td[3][@style='background:#f2fde9;']]"),
	    xpathCtx);
	xmlNodeSetPtr nodes = xpathObj->nodesetval;
	int cNodes = nodes? nodes->nodeNr: 0;

	printf("%d nodes\n", cNodes);
	for (int i = 0; i < cNodes; i++) {
		extern FileView<ElvDataA> g_fvElv;
		extern FileView<DlvDataA> g_fvDlv;

		const xmlNodePtr node = nodes->nodeTab[i];
		if (xmlChildElementCount(node) != 8)
			throw std::runtime_error("unexpected remote data format");

		ElvDataA& e = g_fvElv.At(i);
		DlvDataA& d = g_fvDlv.At(i);

		/* Get cells. */
		const xmlNodePtr nodeEp = xmlFirstElementChild(node);
		const xmlNodePtr nodeTitle = xmlNextElementSibling(xmlNextElementSibling(nodeEp));
		const xmlNodePtr nodeDate = xmlNextElementSibling(nodeTitle);
		const xmlNodePtr nodeSource = xmlNextElementSibling(
		    xmlNextElementSibling(xmlNextElementSibling(nodeDate)));
		const xmlNodePtr nodeHint = xmlNextElementSibling(nodeSource);

		WcharsFromXmlchars(e.siEp, xmlNodeGetContent(nodeEp));
		e.siEp[wcscspn(e.siEp, L"W")] = 0; /* Remove WPS suffix. */
		WcharsFromXmlchars(e.title, xmlNodeGetContent(nodeTitle));
		WcharsFromXmlchars(d.date, xmlNodeGetContent(nodeDate));
		WcharsFromXmlchars(d.source, xmlNodeGetContent(nodeSource));
		WcharsFromXmlchars(d.hint, xmlNodeGetContent(nodeHint));

		/* Get wiki URL. */
		const xmlNodePtr nodeLink = xmlFirstElementChild(nodeTitle);
		if (nodeLink)
			WcharsFromXmlchars(d.wiki, xmlGetProp(nodeLink, (const xmlChar*)"href"));
	}

	*bDone = true;
}