diff options
Diffstat (limited to 'c/data.cpp')
-rw-r--r-- | c/data.cpp | 152 |
1 files changed, 152 insertions, 0 deletions
@@ -1 +1,153 @@ +#include <windows.h> +#include <wininet.h> +#include <libxml/HTMLparser.h> +#include <libxml/HTMLtree.h> +#include <libxml/xpath.h> + #include "data.h" + +struct InternetFile +{ + InternetFile(const wchar_t* url); + ~InternetFile(); + DWORD Read(void* buf, DWORD cb); + HINTERNET hi; + HINTERNET hiUrl; +}; + +InternetFile::InternetFile(const wchar_t* url) +{ + hi = InternetOpen(L"Episode Browser", + INTERNET_OPEN_TYPE_DIRECT, nullptr, nullptr, + /*INTERNET_FLAG_ASYNC*/0); + if (!hi) + throw Win32Error{}; + + hiUrl = InternetOpenUrl(hi, url, + nullptr, 0, INTERNET_FLAG_NO_UI, 0); + if (!hiUrl) { + InternetCloseHandle(hi); + throw Win32Error{}; + } +} + +InternetFile::~InternetFile() +{ + InternetCloseHandle(hiUrl); + InternetCloseHandle(hi); +} + +DWORD InternetFile::Read(void* buf, DWORD cb) +{ + DWORD cbRead; + if (InternetReadFile(hiUrl, buf, cb, &cbRead)) + return cbRead; + else + throw Win32Error{}; +} + +template <auto F, typename T> +struct XmlPtr +{ + XmlPtr(T v) : v(v) + { + if (!v) + throw std::runtime_error(xmlGetLastError()->message); + } + ~XmlPtr() { F(v); } + operator T() { return v; } + T operator ->() { return v; } +private: + T v; +}; + +using HtmlParserCtxtPtr = XmlPtr<xmlFreeParserCtxt, htmlParserCtxtPtr>; +using HtmlDocPtr = XmlPtr<xmlFreeDoc, htmlDocPtr>; +using XmlXPathContextPtr = XmlPtr<xmlXPathFreeContext, xmlXPathContextPtr>; +using XmlXPathObjectPtr = XmlPtr<xmlXPathFreeObject, xmlXPathObjectPtr>; +static inline void XmlFree(void* p) { xmlFree(p); } +using XmlCharPtr = XmlPtr<XmlFree, xmlChar*>; + +template <size_t N> +bool WriteNodeContent(wchar_t (&dst)[N], const xmlNodePtr node) +{ + XmlCharPtr utf8 = xmlNodeGetContent(node); + if (!utf8) + return false; + + /* Truncate if source is larger than destination. */ + utf8[N] = 0; + int lenUtf8 = xmlStrlen(utf8); + + /* Convert internal representation from UTF-8 to Latin-1, + * which seems to actually convert the string to proper UTF-8 + * (???). */ + unsigned char lat1[N]; + int lenLat1 = N-1; + if (UTF8Toisolat1(lat1, &lenLat1, utf8, &lenUtf8) <= 0) + return false; + lat1[lenLat1] = 0; + + /* Write wide string to destination, if it fits. */ + char* const src = reinterpret_cast<char*>(lat1); + const int cchNarrow = lenLat1+1; + const int cchWide = MultiByteToWideChar(CP_UTF8, 0, src, cchNarrow, nullptr, 0); + if (static_cast<size_t>(cchWide) > N) + return false; + return MultiByteToWideChar(CP_UTF8, 0, src, cchNarrow, dst, cchWide); +} + +void FetchData(FileView<ElvDataA>& fvElv, FileView<DlvDataA>& fvDlv) +{ + LIBXML_TEST_VERSION; + + //InternetFile inf{L"https://www.detectiveconanworld.com/wiki/Anime"}; + InternetFile inf{L"file://C:/Users/John/Desktop/dcw.html"}; + char buf[1024]; + + HtmlParserCtxtPtr ctxt = htmlCreatePushParserCtxt(nullptr, nullptr, + buf, sizeof(buf), "https://www.detectiveconanworld.com/wiki/Anime", + XML_CHAR_ENCODING_UTF8); + + htmlCtxtUseOptions(ctxt, HTML_PARSE_RECOVER|HTML_PARSE_NOERROR|HTML_PARSE_NOWARNING); + + while (const DWORD cbRead = inf.Read(&buf, sizeof(buf))) { + if (!htmlParseChunk(ctxt, buf, cbRead, 0)) + throw std::runtime_error(xmlGetLastError()->message); + } + htmlParseChunk(ctxt, buf, 0, 1); /* Stop parsing. */ + + /* Find table rows containing episode data. */ + HtmlDocPtr doc = ctxt->myDoc; + XmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc); + XmlXPathObjectPtr xpathObj = xmlXPathEvalExpression( + reinterpret_cast<const xmlChar*>("//tr[./td[1] != '' and ./td[3][@style='background:#f2fde9;']]"), + xpathCtx); + xmlNodeSetPtr nodes = xpathObj->nodesetval; + int cNodes = nodes? nodes->nodeNr: 0; + + printf("%d nodes\n", cNodes); + for (int i = 0; i < cNodes; i++) { + const xmlNodePtr node = nodes->nodeTab[i]; + if (xmlChildElementCount(node) != 8) + throw std::runtime_error("unexpected remote data format"); + + /* Get cells. */ + const xmlNodePtr nodeEp = xmlFirstElementChild(node); + const xmlNodePtr nodeTitle = xmlNextElementSibling(xmlNextElementSibling(nodeEp)); + const xmlNodePtr nodeDate = xmlNextElementSibling(nodeTitle); + const xmlNodePtr nodeSource = xmlNextElementSibling( + xmlNextElementSibling(xmlNextElementSibling(nodeDate))); + const xmlNodePtr nodeHint = xmlNextElementSibling(nodeSource); + + ElvDataA& e = fvElv.At(i); + DlvDataA& d = fvDlv.At(i); + + WriteNodeContent(e.siEp, nodeEp); + e.siEp[wcscspn(e.siEp, L"W")] = 0; /* Remove WPS suffix. */ + WriteNodeContent(e.title, nodeTitle); + WriteNodeContent(d.date, nodeDate); + WriteNodeContent(d.source, nodeSource); + WriteNodeContent(d.hint, nodeHint); + } +} |