about summary refs log tree commit diff
path: root/c/data.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'c/data.cpp')
-rw-r--r-- c/data.cpp 152
1 files changed, 152 insertions, 0 deletions
diff --git a/c/data.cpp b/c/data.cpp
index 9dd6ef8..1e59b8b 100644
--- a/c/data.cpp
+++ b/c/data.cpp
@@ -1 +1,153 @@
+#include <windows.h>
+#include <wininet.h>
+#include <libxml/HTMLparser.h>
+#include <libxml/HTMLtree.h>
+#include <libxml/xpath.h>
+
#include "data.h"
+
+/*
+ * Holds a WinINet session handle (hi) together with a handle to one
+ * URL opened through that session (hiUrl). The constructor opens
+ * both and the destructor closes both.
+ */
+struct InternetFile
+{
+    /* Opens the session and the URL; throws Win32Error on failure. */
+    InternetFile(const wchar_t* url);
+    ~InternetFile();
+
+    /* The destructor closes the handles unconditionally, so a copy
+     * would close the same handles a second time. */
+    InternetFile(const InternetFile&) = delete;
+    InternetFile& operator=(const InternetFile&) = delete;
+
+    /* Reads up to cb bytes into buf and returns the number of bytes
+     * actually read; throws Win32Error on failure. */
+    DWORD Read(void* buf, DWORD cb);
+
+    HINTERNET hi = nullptr;    /* Session handle from InternetOpen. */
+    HINTERNET hiUrl = nullptr; /* URL handle from InternetOpenUrl. */
+};
+
+/*
+ * Opens a WinINet session and then the given URL within it.
+ *
+ * Throws Win32Error if either handle cannot be obtained. When the URL
+ * cannot be opened, the session handle is closed again before
+ * throwing, so no handle is left open.
+ */
+InternetFile::InternetFile(const wchar_t* url)
+{
+    hi = InternetOpen(L"Episode Browser",
+        INTERNET_OPEN_TYPE_DIRECT, nullptr, nullptr,
+        /*INTERNET_FLAG_ASYNC*/0);
+    if (!hi)
+        throw Win32Error{};
+
+    hiUrl = InternetOpenUrl(hi, url,
+        nullptr, 0, INTERNET_FLAG_NO_UI, 0);
+    if (!hiUrl) {
+        /* Closing the session handle may overwrite the thread's
+         * last-error code. Save and restore it so that the error
+         * reported is the one from InternetOpenUrl (presumably
+         * Win32Error reads GetLastError() -- TODO confirm). */
+        const DWORD err = GetLastError();
+        InternetCloseHandle(hi);
+        SetLastError(err);
+        throw Win32Error{};
+    }
+}
+
+/* Closes both handles opened by the constructor. */
+InternetFile::~InternetFile()
+{
+    /* The URL handle was opened from the session handle, so it is
+     * closed first. */
+    InternetCloseHandle(hiUrl);
+    InternetCloseHandle(hi);
+}
+
+/*
+ * Reads up to cb bytes from the opened URL into buf.
+ *
+ * Returns the number of bytes InternetReadFile reported as read.
+ * Throws Win32Error if InternetReadFile fails.
+ */
+DWORD InternetFile::Read(void* buf, DWORD cb)
+{
+    DWORD cbGot;
+    if (!InternetReadFile(hiUrl, buf, cb, &cbGot))
+        throw Win32Error{};
+    return cbGot;
+}
+
+/*
+ * Owning wrapper around a libxml2 pointer of type T. The pointer is
+ * released by calling F on it when the wrapper is destroyed.
+ *
+ * Construction from a null pointer throws std::runtime_error, using
+ * libxml2's last error message when one is available.
+ */
+template <auto F, typename T>
+struct XmlPtr
+{
+    XmlPtr(T v) : v(v)
+    {
+        if (!v) {
+            /* xmlGetLastError() returns null when no error has been
+             * recorded, and the message itself may be null; neither
+             * may be handed to std::runtime_error unchecked. */
+            const auto* err = xmlGetLastError();
+            throw std::runtime_error(err && err->message
+                ? err->message
+                : "libxml2 returned a null pointer");
+        }
+    }
+    ~XmlPtr() { F(v); }
+
+    /* A copy would call F on the same pointer twice. */
+    XmlPtr(const XmlPtr&) = delete;
+    XmlPtr& operator=(const XmlPtr&) = delete;
+
+    operator T() const { return v; }
+    T operator ->() const { return v; }
+private:
+    T v;
+};
+
+/* Each alias pairs a libxml2 pointer type with the function that
+ * releases it. */
+using XmlXPathObjectPtr = XmlPtr<xmlXPathFreeObject, xmlXPathObjectPtr>;
+using XmlXPathContextPtr = XmlPtr<xmlXPathFreeContext, xmlXPathContextPtr>;
+using HtmlDocPtr = XmlPtr<xmlFreeDoc, htmlDocPtr>;
+using HtmlParserCtxtPtr = XmlPtr<xmlFreeParserCtxt, htmlParserCtxtPtr>;
+
+/* Plain-function wrapper around xmlFree so that it can serve as
+ * XmlPtr's template argument (presumably xmlFree itself is not an
+ * ordinary function in libxml2 -- TODO confirm). */
+static inline void XmlFree(void* mem)
+{
+    xmlFree(mem);
+}
+using XmlCharPtr = XmlPtr<XmlFree, xmlChar*>;
+
+/*
+ * Copies the text content of node into dst as a null-terminated wide
+ * string, truncating content that does not fit.
+ *
+ * Returns true if dst was written. Returns false, leaving dst
+ * untouched, if the node has no content or the conversion to a wide
+ * string fails.
+ */
+template <size_t N>
+bool WriteNodeContent(wchar_t (&dst)[N], const xmlNodePtr node)
+{
+    /* Check for null before handing the pointer to XmlCharPtr, whose
+     * constructor throws on null. */
+    xmlChar* const content = xmlNodeGetContent(node);
+    if (!content)
+        return false;
+    XmlCharPtr utf8 = content;
+
+    /* Consider at most N bytes of the source. The length is clamped
+     * instead of writing a terminator at index N, which lies outside
+     * the buffer whenever the content is shorter than N bytes. */
+    const int cbContent = xmlStrlen(content);
+    int lenUtf8 = cbContent < static_cast<int>(N)
+        ? cbContent
+        : static_cast<int>(N);
+
+    /* Convert internal representation from UTF-8 to Latin-1,
+     * which seems to actually convert the string to proper UTF-8
+     * (???). */
+    unsigned char lat1[N];
+    int lenLat1 = static_cast<int>(N) - 1;
+
+    const char* src;
+    int cchNarrow;
+    /* Only negative results signal failure; zero bytes written is a
+     * valid result for empty content. */
+    if (UTF8Toisolat1(lat1, &lenLat1, content, &lenUtf8) >= 0) {
+        lat1[lenLat1] = 0;
+        src = reinterpret_cast<const char*>(lat1);
+        cchNarrow = lenLat1+1;
+    } else {
+        /* The content is not representable in Latin-1, so the detour
+         * above does not apply to it. Use the UTF-8 content directly,
+         * cut down to what fits in dst together with its
+         * terminator. */
+        const int cbMax = static_cast<int>(N) - 1;
+        if (cbContent > cbMax)
+            content[cbMax] = 0; /* In bounds, as cbMax < cbContent. */
+        src = reinterpret_cast<const char*>(content);
+        cchNarrow = (cbContent < cbMax ? cbContent : cbMax) + 1;
+    }
+
+    /* Write wide string (including terminator) to destination, if it
+     * fits. */
+    const int cchWide = MultiByteToWideChar(CP_UTF8, 0, src, cchNarrow, nullptr, 0);
+    if (cchWide <= 0 || static_cast<size_t>(cchWide) > N)
+        return false;
+    return MultiByteToWideChar(CP_UTF8, 0, src, cchNarrow, dst, cchWide) != 0;
+}
+
+/*
+ * Downloads the episode list page, parses it with libxml2's HTML push
+ * parser and copies the cells of every matching table row into the
+ * entries of fvElv and fvDlv that share the row's index.
+ *
+ * Throws Win32Error (from InternetFile) if the page cannot be opened
+ * or read, and std::runtime_error if no document could be built, if a
+ * libxml2 object could not be created, or if a matching row does not
+ * have exactly 8 cells.
+ */
+void FetchData(FileView<ElvDataA>& fvElv, FileView<DlvDataA>& fvDlv)
+{
+    LIBXML_TEST_VERSION;
+
+    //InternetFile inf{L"https://www.detectiveconanworld.com/wiki/Anime"};
+    InternetFile inf{L"file://C:/Users/John/Desktop/dcw.html"};
+    char buf[1024];
+
+    /* No initial chunk is supplied: buf holds no data yet, so passing
+     * it would feed indeterminate bytes to the parser.
+     * NOTE(review): if the Latin-1 step in WriteNodeContent was
+     * compensating for how the parser reacted to those bytes,
+     * re-verify non-ASCII output after this change. */
+    HtmlParserCtxtPtr ctxt = htmlCreatePushParserCtxt(nullptr, nullptr,
+        nullptr, 0, "https://www.detectiveconanworld.com/wiki/Anime",
+        XML_CHAR_ENCODING_UTF8);
+
+    htmlCtxtUseOptions(ctxt, HTML_PARSE_RECOVER|HTML_PARSE_NOERROR|HTML_PARSE_NOWARNING);
+
+    /* htmlParseChunk returns 0 on success and a parser error code
+     * otherwise. The parser runs in recovery mode, so a non-zero
+     * result is not treated as fatal; a parse that yields no
+     * document at all is caught below. */
+    while (const DWORD cbRead = inf.Read(buf, sizeof(buf)))
+        htmlParseChunk(ctxt, buf, static_cast<int>(cbRead), 0);
+    htmlParseChunk(ctxt, buf, 0, 1); /* Stop parsing. */
+
+    if (!ctxt->myDoc)
+        throw std::runtime_error("failed to parse remote data");
+
+    /* Find table rows containing episode data. */
+    HtmlDocPtr doc = ctxt->myDoc;
+    XmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc);
+    XmlXPathObjectPtr xpathObj = xmlXPathEvalExpression(
+        reinterpret_cast<const xmlChar*>("//tr[./td[1] != '' and ./td[3][@style='background:#f2fde9;']]"),
+        xpathCtx);
+    xmlNodeSetPtr nodes = xpathObj->nodesetval;
+    int cNodes = nodes? nodes->nodeNr: 0;
+
+    printf("%d nodes\n", cNodes);
+    for (int i = 0; i < cNodes; i++) {
+        const xmlNodePtr node = nodes->nodeTab[i];
+        if (xmlChildElementCount(node) != 8)
+            throw std::runtime_error("unexpected remote data format");
+
+        /* Get cells: episode is the 1st child element, title the
+         * 3rd, date the 4th, source the 7th and hint the 8th. */
+        const xmlNodePtr nodeEp = xmlFirstElementChild(node);
+        const xmlNodePtr nodeTitle = xmlNextElementSibling(xmlNextElementSibling(nodeEp));
+        const xmlNodePtr nodeDate = xmlNextElementSibling(nodeTitle);
+        const xmlNodePtr nodeSource = xmlNextElementSibling(
+            xmlNextElementSibling(xmlNextElementSibling(nodeDate)));
+        const xmlNodePtr nodeHint = xmlNextElementSibling(nodeSource);
+
+        ElvDataA& e = fvElv.At(i);
+        DlvDataA& d = fvDlv.At(i);
+
+        WriteNodeContent(e.siEp, nodeEp);
+        e.siEp[wcscspn(e.siEp, L"W")] = 0; /* Remove WPS suffix. */
+        WriteNodeContent(e.title, nodeTitle);
+        WriteNodeContent(d.date, nodeDate);
+        WriteNodeContent(d.source, nodeSource);
+        WriteNodeContent(d.hint, nodeHint);
+    }
+}