1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
|
#include <windows.h>
#include <wininet.h>
#include <libxml/HTMLparser.h>
#include <libxml/HTMLtree.h>
#include <libxml/xpath.h>
#include "data.h"
#include "win.h"
/* Exception describing the most recent error recorded by libxml2.
 *
 * The message pointer is borrowed from libxml2's last-error record,
 * not copied, so what() should be consumed before further libxml2
 * calls replace or reset that record. */
struct XmlError : public std::exception
{
    /* Message returned by what(); never null. */
    const char* msg;

    XmlError()
    {
        /* xmlGetLastError() returns null when no error has been
         * recorded, e.g., when a libxml2 getter simply had nothing
         * to return. Fall back to a fixed message instead of
         * dereferencing a null pointer. */
        const auto* const err = xmlGetLastError();
        msg = err && err->message? err->message: "unknown libxml2 error";
    }

    virtual const char* what() const noexcept override
    {
        return msg;
    }
};
/* Wrappers for the libxml2 handles used in this file. Managed is
 * declared elsewhere; judging by its arguments, each alias pairs a
 * handle type with the function that releases it and with XmlError
 * as the associated error type (presumably thrown when the wrapped
 * handle is null -- confirm against Managed's definition). */
using HtmlParserCtxtPtr = Managed<htmlParserCtxtPtr, xmlFreeParserCtxt, XmlError>;
using HtmlDocPtr = Managed<htmlDocPtr, xmlFreeDoc, XmlError>;
using XmlXPathContextPtr = Managed<xmlXPathContextPtr, xmlXPathFreeContext, XmlError>;
using XmlXPathObjectPtr = Managed<xmlXPathObjectPtr, xmlXPathFreeObject, XmlError>;
/* Plain-function adapter that forwards to xmlFree, so that it can be
 * named as a Managed template argument. NOTE(review): presumably
 * needed because libxml2 exposes xmlFree as a function pointer
 * variable rather than as a function -- confirm. */
static inline void XmlFree(void* p) { xmlFree(p); }
/* Wrapper for strings handed out by libxml2, released via XmlFree. */
using XmlCharPtr = Managed<xmlChar*, XmlFree, XmlError>;
/* Convert a string obtained from libxml2 into the fixed-size wide
 * character array dst.
 *
 * Returns false, leaving dst untouched, if the Latin-1 conversion
 * step fails or produces no bytes, or if the wide string would need
 * more than N characters. Otherwise returns whether the final
 * MultiByteToWideChar call reported a non-zero result. */
template <size_t N>
bool WcharsFromXmlchars(wchar_t (&dst)[N], XmlCharPtr utf8) noexcept
{
    /* Cut the source off with a terminator at index N when it is
     * longer than the destination. */
    int cbSource = xmlStrlen(utf8);
    const size_t iCut = Min(N, static_cast<size_t>(cbSource));
    utf8[iCut] = 0;

    /* Run the internal representation through a UTF-8 to Latin-1
     * conversion. Per the original author's observation, the
     * resulting bytes seem to actually be the proper UTF-8 form of
     * the string (???). At most N-1 bytes are produced, leaving room
     * for the terminator. */
    unsigned char narrow[N];
    int cbNarrow = N-1;
    const int rc = UTF8Toisolat1(narrow, &cbNarrow, utf8, &cbSource);
    if (rc <= 0)
        return false;
    narrow[cbNarrow] = 0;

    /* Measure the wide string first (terminator included), then
     * write it to the destination only if it fits. */
    char* const pszNarrow = reinterpret_cast<char*>(narrow);
    const int cchIn = cbNarrow+1;
    const int cchOut = MultiByteToWideChar(CP_UTF8, 0, pszNarrow, cchIn, nullptr, 0);
    if (N < static_cast<size_t>(cchOut))
        return false;
    return MultiByteToWideChar(CP_UTF8, 0, pszNarrow, cchIn, dst, cchOut);
}
/* Download the episode list from the Detective Conan World wiki and
 * write it into the global file views g_fvElv and g_fvDlv, one entry
 * per matching table row.
 *
 * Throws InternetError if reading from the network fails, and
 * std::runtime_error if the page yields no document, contains no
 * episode rows or has rows of an unexpected shape. The Managed
 * wrappers may throw their own error types as well (presumably when
 * handed a null handle -- confirm against Managed's definition). */
void FetchData()
{
    LIBXML_TEST_VERSION;

    extern FileView<ElvDataA> g_fvElv;
    extern FileView<DlvDataA> g_fvDlv;

    using InternetHandle = Managed<HINTERNET, InternetCloseHandle, InternetError>;

    /* The remote data is retrieved using WinINet from the
     * Detective Conan World wiki. Using libxml2's "push parser",
     * the HTML is parsed piece by piece as it is retrieved. */
    InternetHandle hi = InternetOpen(L"Episode Browser", INTERNET_OPEN_TYPE_DIRECT, nullptr, nullptr, 0);
    InternetHandle hiUrl = InternetOpenUrl(hi, L"https://www.detectiveconanworld.com/wiki/Anime",
        nullptr, 0, INTERNET_FLAG_NO_UI, 0); //L"file://C:/Users/John/Desktop/dcw.html"

    /* No initial chunk is handed to the parser: nothing has been
     * read yet, so the buffer holds no document data at this
     * point. */
    char buf[1024];
    HtmlParserCtxtPtr ctxt = htmlCreatePushParserCtxt(nullptr, nullptr,
        nullptr, 0, "https://www.detectiveconanworld.com/wiki/Anime",
        XML_CHAR_ENCODING_UTF8);
    htmlCtxtUseOptions(ctxt, HTML_PARSE_RECOVER|HTML_PARSE_NOERROR|HTML_PARSE_NOWARNING);

    for (;;) {
        /* A failed read must be reported before the byte count is
         * looked at; otherwise a failure that reads zero bytes would
         * be mistaken for the end of the data. */
        DWORD cbRead = 0;
        if (!InternetReadFile(hiUrl, buf, sizeof(buf), &cbRead))
            throw InternetError();
        if (!cbRead)
            break; /* End of data. */

        /* htmlParseChunk returns zero on success and an error code
         * otherwise. The parser runs in recovery mode, where
         * non-zero codes are expected for real-world HTML, so the
         * result is deliberately not treated as fatal here. */
        htmlParseChunk(ctxt, buf, cbRead, 0);
    }
    htmlParseChunk(ctxt, buf, 0, 1); /* Stop parsing. */

    if (!ctxt->myDoc)
        throw std::runtime_error("could not parse remote episode information");

    /* The episode data are contained in table rows matching a
     * (very!) specific XPath query. This is fragile
     * theoretically, but unlikely to break practically. */
    HtmlDocPtr doc = ctxt->myDoc;
    XmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc);
    XmlXPathObjectPtr xpathObj = xmlXPathEvalExpression(
        reinterpret_cast<const xmlChar*>("//tr[./td[1] != '' and ./td[3][@style='background:#f2fde9;']]"),
        xpathCtx);

    xmlNodeSetPtr nodes = xpathObj->nodesetval;
    int cNodes = nodes? nodes->nodeNr: 0;
    if (!cNodes)
        throw std::runtime_error("could not find remote episode information");

    for (int i = 0; i < cNodes; i++) {
        const xmlNodePtr node = nodes->nodeTab[i];
        if (xmlChildElementCount(node) != 8)
            throw std::runtime_error("unexpected remote data format");

        ElvDataA& e = g_fvElv.At(i);
        DlvDataA& d = g_fvDlv.At(i);

        /* Each datum is contained within a specific cell in
         * the row. The child element count above ensures that
         * none of the following nodes are null. */
        const xmlNodePtr nodeEp = xmlFirstElementChild(node);
        const xmlNodePtr nodeTitle = xmlNextElementSibling(xmlNextElementSibling(nodeEp));
        const xmlNodePtr nodeDate = xmlNextElementSibling(nodeTitle);
        const xmlNodePtr nodeSource = xmlNextElementSibling(
            xmlNextElementSibling(xmlNextElementSibling(nodeDate)));
        const xmlNodePtr nodeHint = xmlNextElementSibling(nodeSource);

        WcharsFromXmlchars(d.date, xmlNodeGetContent(nodeDate));
        WcharsFromXmlchars(d.source, xmlNodeGetContent(nodeSource));
        WcharsFromXmlchars(d.hint, xmlNodeGetContent(nodeHint));

        e.bTVOriginal = wcsncmp(d.source, L"TV", 2) == 0? 1: 0;
        WcharsFromXmlchars(e.siEp, xmlNodeGetContent(nodeEp));
        e.siEp[wcscspn(e.siEp, L"W")] = 0; /* Remove potential "WPS" suffix. */
        WcharsFromXmlchars(e.title, xmlNodeGetContent(nodeTitle));

        /* Retrieve the link to the episode's wiki entry,
         * which should be the first (and only) child element
         * of the title node. */
        const xmlNodePtr nodeLink = xmlFirstElementChild(nodeTitle);
        if (nodeLink)
            WcharsFromXmlchars(d.wiki, xmlGetProp(nodeLink, reinterpret_cast<const xmlChar*>("href")));
    }
}
/* Run FetchData and set *bDone to true once it has finished, whether
 * it succeeded or threw. On failure, the flag is set first and the
 * error is then reported through ShowException from within the
 * handler. */
void WaitFetchData(bool* bDone) noexcept
{
    try {
        FetchData();
    } catch (...) {
        /* Mark completion before the error is reported. */
        *bDone = true;
        ShowException(L"Remote data could not be fetched due to an error: %s", L"Error", MB_ICONWARNING);
        return;
    }

    *bDone = true;
}
|