diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 0000000000000000000000000000000000000000..d7ff5c3fda3ca08f08c390e796ac05227bc320de --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,26 @@ +authors: + - family-names: Pirinen + given-names: Flammie A + orcid: "https://orcid.org/0000-0003-1207-5395" + - family-names: Hardwick + given-names: Sam +cff-version: 1.2.0 +date-released: "2022-03-13" +keywords: + - spell-checking + - nlp +message: If you use this software, please cite it using these metadata. +repository-code: "https://github.com/hfst/hfst-ospell" +title: HFST ospell +version: 0.5.3 +preferred-citation: + authors: + - family-names: Pirinen + given-names: Flammie A + - family-names: Hardwick + given-names: Sam + - family-names: Lindén + given-names: Krister + title: "Effect of language and error models on efficiency of finite-state spell-checking and correction" + type: article +license: GPL-3.0 diff --git a/Makefile.am b/Makefile.am index 922cefcba2e7248522d9f2814af54f11443af97a..222166fef7a1c4b19191a5059d99240993b1c2e4 100644 --- a/Makefile.am +++ b/Makefile.am @@ -27,8 +27,12 @@ endif # EXTRA_DEMOS if HFST_OSPELL_OFFICE MAYBE_HFST_OSPELL_OFFICE=hfst-ospell-office endif # HFST_OSPELL_OFFICE +if HFST_OSPELL_PREDICT +MAYBE_HFST_OSPELL_PREDICT=hfst-ospell-predict +endif -bin_PROGRAMS=hfst-ospell $(MAYBE_HFST_OSPELL_OFFICE) $(CONFERENCE_DEMOS) +bin_PROGRAMS=hfst-ospell $(MAYBE_HFST_OSPELL_OFFICE) $(CONFERENCE_DEMOS) \ + $(MAYBE_HFST_OSPELL_PREDICT) lib_LTLIBRARIES=libhfstospell.la man1_MANS=hfst-ospell.1 hfst-ospell-office.1 @@ -63,6 +67,13 @@ hfst_ospell_LDADD=libhfstospell.la hfst_ospell_CXXFLAGS=$(AM_CXXFLAGS) $(CXXFLAGS) \ $(PKG_CXXFLAGS) +if HFST_OSPELL_PREDICT +hfst_ospell_predict_SOURCES=predict.cc +hfst_ospell_predict_LDADD=libhfstospell.la +hfst_ospell_predict_CXXFLAGS=$(AM_CXXFLAGS) $(CXXFLAGS) \ + $(PKG_CXXFLAGS) +endif + if HFST_OSPELL_OFFICE hfst_ospell_office_SOURCES=office.cc diff --git a/README b/README.md similarity index 100% 
rename from README rename to README.md diff --git a/ZHfstOspeller.cc b/ZHfstOspeller.cc index b5b4886889b5715abb95e1196ed5822f765653fd..5a669b1d29ccd41625acceabe5eaab4e0bddebb2 100644 --- a/ZHfstOspeller.cc +++ b/ZHfstOspeller.cc @@ -58,7 +58,7 @@ inline std::string extract_to_mem(archive* ar, archive_entry* entry) { std::string buff(buffsize, 0); for (;;) { - ssize_t curr = archive_read_data(ar, &buff[0] + full_length, buffsize - full_length); + auto curr = archive_read_data(ar, &buff[0] + full_length, buffsize - full_length); if (0 == curr) { break; } diff --git a/configure.ac b/configure.ac index bc5fc9dbee690edfc550db765f5d0fc9db14f504..4f1cb6661ce033acab9578eaa977b79602369b80 100644 --- a/configure.ac +++ b/configure.ac @@ -17,7 +17,7 @@ # autoconf requirements AC_PREREQ([2.62]) -AC_INIT([hfstospell], [0.5.3], [hfst-bugs@helsinki.fi], [hfstospell], [http://hfst.github.io]) +AC_INIT([hfstospell],[0.5.4],[hfst-bugs@helsinki.fi],[hfstospell],[http://hfst.github.io]) LT_PREREQ([2.2.6]) @@ -34,7 +34,7 @@ AC_CONFIG_HEADERS([config.h]) HFSTOSPELL_NAME=hfstospell HFSTOSPELL_MAJOR=0 HFSTOSPELL_MINOR=5 -HFSTOSPELL_EXTENSION=.3 +HFSTOSPELL_EXTENSION=.4 HFSTOSPELL_VERSION=$HFSTOSPELL_MAJOR.$HFSTOSPELL_MINOR$HFSTOSPELL_EXTENSION AC_SUBST(HFSTOSPELL_MAJOR) AC_SUBST(HFSTOSPELL_MINOR) @@ -54,8 +54,13 @@ AM_CONDITIONAL([EXTRA_DEMOS], [test x$enable_extra_demos != xno]) AC_ARG_ENABLE([hfst_ospell_office], [AS_HELP_STRING([--enable-hfst-ospell-office], [build hfst-ospell-office @<:@default=yes@:>@])], - [enable_hfst_ospell_ofiice=$enableval], [enable_hfst_ospell_office=yes]) + [enable_hfst_ospell_office=$enableval], [enable_hfst_ospell_office=yes]) AM_CONDITIONAL([HFST_OSPELL_OFFICE], [test x$enable_hfst_ospell_office != xno]) +AC_ARG_ENABLE([hfst_ospell_predict], + [AS_HELP_STRING([--enable-hfst-ospell-predict], + [build hfst-ospell-predict @<:@default=yes@:>@])], + [enable_hfst_ospell_predict=$enableval], [enable_hfst_ospell_predict=yes]) +AM_CONDITIONAL([HFST_OSPELL_PREDICT], 
[test x$enable_hfst_ospell_predict != xno]) AC_ARG_ENABLE([zhfst], [AS_HELP_STRING([--enable-zhfst], [support zipped complex automaton sets @<:@default=check@:>@])], @@ -84,7 +89,7 @@ AS_IF([test "x$with_extract" = xmem], [AC_DEFINE([ZHFST_EXTRACT_TO_MEM], [1], m4_ifdef([AM_PROG_AR], [AM_PROG_AR]) AC_PROG_CC AC_PROG_CXX -AC_LIBTOOL_WIN32_DLL + LT_INIT AC_PROG_INSTALL AC_PROG_LN_S @@ -146,17 +151,17 @@ AC_CHECK_FUNCS([strndup error]) # Require highest supported C++ standard AC_LANG(C++) -AX_CHECK_COMPILE_FLAG([-std=c++20], [CXXFLAGS="$CXXFLAGS -std=c++20"], [ - AX_CHECK_COMPILE_FLAG([-std=c++2a], [CXXFLAGS="$CXXFLAGS -std=c++2a"], [ - AX_CHECK_COMPILE_FLAG([-std=c++17], [CXXFLAGS="$CXXFLAGS -std=c++17"], [ - AX_CHECK_COMPILE_FLAG([-std=c++1z], [CXXFLAGS="$CXXFLAGS -std=c++1z"], [ - AX_CHECK_COMPILE_FLAG([-std=c++14], [CXXFLAGS="$CXXFLAGS -std=c++14"], [ - AX_CHECK_COMPILE_FLAG([-std=c++1y], [CXXFLAGS="$CXXFLAGS -std=c++1y"], [ - AC_MSG_ERROR([Could not enable at least C++1y (C++14) - upgrade your compiler]) - ]) - ]) - ]) - ]) +AX_CHECK_COMPILE_FLAG([-std=c++23], [CXXFLAGS="$CXXFLAGS -std=c++23"], [ + AX_CHECK_COMPILE_FLAG([-std=c++2b], [CXXFLAGS="$CXXFLAGS -std=c++2b"], [ + AX_CHECK_COMPILE_FLAG([-std=c++20], [CXXFLAGS="$CXXFLAGS -std=c++20"], [ + AX_CHECK_COMPILE_FLAG([-std=c++2a], [CXXFLAGS="$CXXFLAGS -std=c++2a"], [ + AX_CHECK_COMPILE_FLAG([-std=c++17], [CXXFLAGS="$CXXFLAGS -std=c++17"], [ + AX_CHECK_COMPILE_FLAG([-std=c++1z], [CXXFLAGS="$CXXFLAGS -std=c++1z"], [ + AC_MSG_ERROR([Could not enable at least C++1z (C++17) - upgrade your compiler]) + ]) + ]) + ]) + ]) ]) ]) diff --git a/debian/changelog b/debian/changelog index 769949ded935f2deffa426d9e03b37238d0ea003..2d97cf23a4c609316534d830c713d79e86b1fad2 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,17 @@ +hfst-ospell (0.5.4-1) unstable; urgency=medium + + * Update to latest upstream + + Fix FTBFS by Michael.Karcher@fu-berlin.de (Closes: #988129) + + -- Tino Didriksen <tino@didriksen.cc> Tue, 
20 Feb 2024 12:46:23 +0100 + +hfst-ospell (0.5.3-2) unstable; urgency=medium + + * Team upload + * Fix gcc-13 build with upstream patch (Closes: #1037690) + + -- Bastian Germann <bage@debian.org> Tue, 08 Aug 2023 21:42:28 +0200 + hfst-ospell (0.5.3-1) unstable; urgency=low [ Tino Didriksen ] diff --git a/debian/control b/debian/control index b27d6218505838fe454f322bfdd6e3050e77f6cf..6d0de238415f7507081e4ee6eed7c3412ebca79c 100644 --- a/debian/control +++ b/debian/control @@ -1,7 +1,7 @@ Source: hfst-ospell Section: science Priority: optional -Maintainer: Debian Science Team <debian-science-maintainers@alioth-lists.debian.net> +Maintainer: Debian Science Maintainers <debian-science-maintainers@alioth-lists.debian.net> Uploaders: Tino Didriksen <tino@didriksen.cc>, Kartik Mistry <kartik@debian.org> Build-Depends: autoconf, @@ -10,7 +10,7 @@ Build-Depends: autoconf, libicu-dev, pkg-config, zip -Standards-Version: 4.6.0 +Standards-Version: 4.6.2 Homepage: https://github.com/hfst/hfst-ospell Vcs-Git: https://salsa.debian.org/science-team/hfst-ospell.git Vcs-Browser: https://salsa.debian.org/science-team/hfst-ospell @@ -31,6 +31,8 @@ Multi-Arch: same Section: libs Depends: ${misc:Depends}, ${shlibs:Depends} Provides: libhfstospell +Conflicts: libhfstospell, libhfstospell11 +Replaces: libhfstospell, libhfstospell11 Description: HFST spell checker runtime libraries Minimal HFST optimized lookup format based spell checker library and a demonstrational implementation of command line based spell checker. 
diff --git a/debian/docs b/debian/docs index 6f83607c14aa1cd7cb72686f7cda5ddd090b09ae..46a4ca7180f4a6e0f569152a8ec634c8ed313ca6 100644 --- a/debian/docs +++ b/debian/docs @@ -1,3 +1,3 @@ AUTHORS NEWS -README +README.md diff --git a/debian/rules b/debian/rules index 2f0ba35b489cc37222707d7900819411b5b73766..5837872ad491794e206776acf582f00c685a0f75 100755 --- a/debian/rules +++ b/debian/rules @@ -5,7 +5,7 @@ export DH_OPTIONS export LC_ALL=C.UTF-8 -export DEB_BUILD_MAINT_OPTIONS = hardening=+all +export "DEB_BUILD_MAINT_OPTIONS=hardening=+all optimize=+lto reproducible=+fixfilepath" DPKG_EXPORT_BUILDFLAGS = 1 include /usr/share/dpkg/buildflags.mk @@ -23,6 +23,3 @@ ifeq ($(filter nocheck,$(DEB_BUILD_OPTIONS)),) override_dh_auto_test: dh_auto_test --no-parallel endif - -override_dh_missing: - dh_missing --fail-missing diff --git a/debian/watch b/debian/watch index 6cbc031339c31b6c2f3b5ef033aca400b137f221..f34cdd277bfcff3e2045c197901d50d68d222531 100644 --- a/debian/watch +++ b/debian/watch @@ -1,3 +1,4 @@ version=4 -https://github.com/hfst/hfst-ospell/releases \ - .*/@PACKAGE@-(\d[\d.]*)\.tar\.bz2 debian uupdate +opts="searchmode=plain" \ + https://api.github.com/repos/hfst/@PACKAGE@/releases \ + https://github.com/hfst/@PACKAGE@/releases/download/v(?:\d[\d.]*)/@PACKAGE@@ANY_VERSION@@ARCHIVE_EXT@ diff --git a/hfst-ol.cc b/hfst-ol.cc index e0a52f43922ba9a1db8dc08da14ad0429cbafc4b..afb6d4e6e0dee850a27f81447fb62eeedcdc2a64 100644 --- a/hfst-ol.cc +++ b/hfst-ol.cc @@ -59,9 +59,9 @@ uint16_t read_uint16_flipping_endianness(FILE * f) uint16_t read_uint16_flipping_endianness(char * raw) { uint16_t result = 0; - result |= *(raw + 1); + result |= static_cast<uint8_t>(*(raw + 1)); result <<= 8; - result |= *raw; + result |= static_cast<uint8_t>(*raw); return result; } @@ -85,13 +85,13 @@ uint32_t read_uint32_flipping_endianness(FILE * f) uint32_t read_uint32_flipping_endianness(char * raw) { uint32_t result = 0; - result |= *(raw + 3); + result |= static_cast<uint8_t>(*(raw + 
3)); result <<= 8; - result |= *(raw + 2); + result |= static_cast<uint8_t>(*(raw + 2)); result <<= 8; - result |= *(raw + 1); + result |= static_cast<uint8_t>(*(raw + 1)); result <<= 8; - result |= *raw; + result |= static_cast<uint8_t>(*raw); return result; } diff --git a/hfst-ol.h b/hfst-ol.h index 0a83ec5f387d8379b0ac59a6d8e9654a294fa827..ee59b14e9bb6410d96537962f21da74612867a52 100644 --- a/hfst-ol.h +++ b/hfst-ol.h @@ -28,6 +28,7 @@ #include <climits> #include <cstdio> #include <cstdlib> +#include <cstdint> #include <iostream> #include <cstring> #include <set> diff --git a/main.cc b/main.cc index 70ca402fe16a757ddb4a0a8ab76581e56f739ed6..346b822b52d8ca4025d04566013c74ace70a981b 100644 --- a/main.cc +++ b/main.cc @@ -164,6 +164,10 @@ bool print_short_help(void) void do_suggest(ZHfstOspeller& speller, const std::string& str) { + if (verbose) + { + hfst_fprintf(stdout, "Suggesting for %s:\n", str.c_str()); + } hfst_ospell::CorrectionQueue corrections = speller.suggest(str); if (corrections.size() > 0) { diff --git a/predict.cc b/predict.cc new file mode 100644 index 0000000000000000000000000000000000000000..502e60b5ec558f9a4470b884916ad7ed9c3adbd0 --- /dev/null +++ b/predict.cc @@ -0,0 +1,648 @@ +/* + + Copyright 2022 Flammie A Pirinen + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +*/ + +/* + This is a toy commandline utility for testing spellers on standard io. 
+ */
+
+#if HAVE_CONFIG_H
+#include <config.h>
+#else
+// Fallbacks so the usage and version texts are not blank when the file is
+// built without config.h.
+#define PACKAGE_NAME "hfstospell"
+#define PACKAGE_BUGREPORT "hfst-bugs@helsinki.fi"
+#define PACKAGE_STRING "hfstospell"
+#endif
+#if HAVE_GETOPT_H
+#include <getopt.h>
+#endif
+
+#ifdef WINDOWS
+#include <windows.h>
+#endif
+
+// Include what this file uses directly instead of relying on whatever the
+// project headers happen to pull in.
+#include <algorithm>
+#include <cstdarg>
+#include <cstdlib>
+#include <cstring>
+#include <errno.h>
+#include <iostream>
+#include <stdio.h>
+#include <string>
+
+#include "ZHfstOspeller.h"
+#include "ol-exceptions.h"
+#include "ospell.h"
+
+using hfst_ospell::Transducer;
+using hfst_ospell::ZHfstOspeller;
+
+// Command line state, filled in by main().
+static bool quiet = false;
+static bool verbose = false;
+static bool analyse = false;
+static unsigned long suggs = 0;               // -n; 0 leaves the limit unset
+static hfst_ospell::Weight max_weight = -1.0; // -w; negative means not given
+static hfst_ospell::Weight beam = -1.0;       // -b; negative means not given
+static float time_cutoff = 0.0;               // -t, in seconds
+static std::string error_model_filename = "";
+static std::string lexicon_filename = "";
+static std::string continuation_marker = "";
+#ifdef WINDOWS
+static bool output_to_console = false;
+#endif
+
+#ifdef WINDOWS
+/**
+ * Convert a UTF-16 string to UTF-8 with WideCharToMultiByte.
+ *
+ * The first call only measures the required size, the second one fills the
+ * result.
+ */
+static std::string
+wide_string_to_string(const std::wstring &wstr)
+{
+    const int wide_len = static_cast<int>(wstr.size());
+    int size_needed = WideCharToMultiByte(CP_UTF8, 0, &wstr[0], wide_len,
+                                          NULL, 0, NULL, NULL);
+    std::string str(size_needed, 0);
+    WideCharToMultiByte(CP_UTF8, 0, &wstr[0], wide_len, &str[0],
+                        size_needed, NULL, NULL);
+    return str;
+}
+#endif
+
+// std::string::ends_with only exists from C++20 on; this helper does the
+// same job on older standards.
+// Adapted from https://stackoverflow.com/a/2072890/4109773
+/**
+ * Return true when `value` ends with `ending`.
+ *
+ * An empty `ending` matches every string; an `ending` longer than `value`
+ * never matches.
+ */
+inline bool
+ends_with(std::string const &value, std::string const &ending)
+{
+    if (ending.size() > value.size())
+        return false;
+    return std::equal(ending.rbegin(), ending.rend(), value.rbegin());
+}
+
+/**
+ * printf-style output to `stream`.
+ *
+ * On Windows, when output_to_console is set and `stream` is stdout or
+ * stderr, the formatted text is converted from UTF-8 and written with
+ * WriteConsoleW; in every other case it is passed to vfprintf.
+ *
+ * Returns the result of the call that performed the write.
+ */
+static int
+hfst_fprintf(FILE *stream, const char *format, ...)
+{
+    va_list args;
+    va_start(args, format);
+#ifdef WINDOWS
+    if (output_to_console && (stream == stdout || stream == stderr))
+    {
+        char buffer[1024];
+        // vsnprintf bounds the write to the buffer: a longer message is
+        // truncated instead of overrunning the stack as vsprintf would.
+        int r = vsnprintf(buffer, sizeof buffer, format, args);
+        va_end(args);
+        if (r < 0)
+            return r;
+        HANDLE stdHandle = GetStdHandle(STD_OUTPUT_HANDLE);
+        if (stream == stderr)
+            stdHandle = GetStdHandle(STD_ERROR_HANDLE);
+
+        std::string pstr(buffer);
+        DWORD numWritten = 0;
+        // With a source length of -1 the returned count includes the
+        // terminating NUL; zero means the conversion failed.
+        int wchars_num
+            = MultiByteToWideChar(CP_UTF8, 0, pstr.c_str(), -1, NULL, 0);
+        if (wchars_num <= 0)
+            return -1;
+        std::wstring wstr(wchars_num, L'\0');
+        MultiByteToWideChar(CP_UTF8, 0, pstr.c_str(), -1, &wstr[0],
+                            wchars_num);
+        // wchars_num - 1: do not write the terminating NUL to the console.
+        int retval = WriteConsoleW(stdHandle, wstr.c_str(), wchars_num - 1,
+                                   &numWritten, NULL);
+
+        return retval;
+    }
+    else
+    {
+        int retval = vfprintf(stream, format, args);
+        va_end(args);
+        return retval;
+    }
+#else
+    errno = 0;
+    int retval = vfprintf(stream, format, args);
+    if (retval < 0)
+    {
+        perror("hfst_fprintf");
+    }
+    va_end(args);
+    return retval;
+#endif
+}
+
+/** Print the option summary to standard output. Always returns true. */
+bool
+print_usage(void)
+{
+    std::cout
+        << "\n"
+        << "Usage: " PACKAGE_NAME " [OPTIONS] [ZHFST-ARCHIVE]\n"
+        << "Use automata in ZHFST-ARCHIVE or from OPTIONS to check and "
+           "predict\n"
+           "\n"
+        << " -h, --help Print this help message\n"
+        << " -V, --version Print version information\n"
+        << " -v, --verbose Be verbose\n"
+        << " -q, --quiet Don't be verbose (default)\n"
+        << " -s, --silent Same as quiet\n"
+        << " -a, --analyse Analyse strings and corrections\n"
+        << " -n, --limit=N Show at most N predictions\n"
+        << " -w, --max-weight=W Suppress corrections with weights "
+           "above W\n"
+        << " -b, --beam=W Suppress corrections worse than best "
+           "candidate by more than W\n"
+        << " -t, --time-cutoff=T Stop trying to find better "
+           "corrections after T seconds (T is a float)\n"
+        << " -C, --continuation=C Word-continuation character is C\n"
+        << " -m, --error-model Use this error model (must also give "
+           "lexicon as option)\n"
+        << " -l, --lexicon Use this lexicon (must also give error "
+           "model as option)\n"
+        <<
+#ifdef WINDOWS
+           " -k, --output-to-console Print output to console "
+           "(Windows-specific)"
+        <<
+#endif
+           "\n"
+        << "\n"
+        << "Report bugs to " PACKAGE_BUGREPORT "\n"
+        << "\n";
+    return true;
+}
+
+/** Print the package string and copyright line. Always returns true. */
+bool
+print_version(void)
+{
+    std::cout << "\n" PACKAGE_STRING << std::endl
+              << "copyright (C) 2009 - 2022 University of Helsinki\n";
+    return true;
+}
+
+/** Short help is currently the full usage text. Always returns true. */
+bool
+print_short_help(void)
+{
+    print_usage();
+    return true;
+}
+
+/**
+ * Print everything speller.suggest(str) returns, best first.
+ *
+ * With `analyse` set, every suggestion is analysed and each analysis is
+ * printed; analyses containing "Use/SpellNoSugg" are flagged as discarded.
+ * Otherwise a suggestion that ends in continuation_marker is printed with
+ * the marker cut off and "..." appended.
+ */
+void
+do_predict(ZHfstOspeller &speller, const std::string &str)
+{
+    if (verbose)
+    {
+        hfst_fprintf(stdout, "Suggesting for %s:\n", str.c_str());
+    }
+    hfst_ospell::CorrectionQueue corrections = speller.suggest(str);
+    if (corrections.size() > 0)
+    {
+        hfst_fprintf(stdout, "Corrections for \"%s\":\n", str.c_str());
+        while (corrections.size() > 0)
+        {
+            // Refers to the queue's top element; only valid until pop().
+            const std::string &corr = corrections.top().first;
+            if (analyse)
+            {
+                hfst_ospell::AnalysisQueue anals = speller.analyse(corr, true);
+                bool all_discarded = true;
+                while (anals.size() > 0)
+                {
+                    if (anals.top().first.find("Use/SpellNoSugg")
+                        != std::string::npos)
+                    {
+                        hfst_fprintf(stdout,
+                                     "%s %f %s "
+                                     "[DISCARDED BY ANALYSES]\n",
+                                     corr.c_str(), corrections.top().second,
+                                     anals.top().first.c_str());
+                    }
+                    else
+                    {
+                        all_discarded = false;
+                        hfst_fprintf(stdout, "%s %f %s\n", corr.c_str(),
+                                     corrections.top().second,
+                                     anals.top().first.c_str());
+                    }
+                    anals.pop();
+                }
+                if (all_discarded)
+                {
+                    hfst_fprintf(stdout, "All corrections were "
+                                         "invalidated by analysis! "
+                                         "No score!\n");
+                }
+            }
+            else
+            {
+                if ((!continuation_marker.empty())
+                    && (ends_with(corr, continuation_marker)))
+                {
+                    std::string chomped = corr.substr(
+                        0, corr.size() - continuation_marker.size());
+                    hfst_fprintf(stdout, "%s... %f (continuation %s)\n",
+                                 chomped.c_str(), corrections.top().second,
+                                 continuation_marker.c_str());
+                }
+                else
+                {
+                    hfst_fprintf(stdout, "%s %f\n", corr.c_str(),
+                                 corrections.top().second);
+                }
+            }
+            corrections.pop();
+        }
+        hfst_fprintf(stdout, "\n");
+    }
+    else
+    {
+        hfst_fprintf(stdout, "Unable to correct \"%s\"!\n\n", str.c_str());
+    }
+}
+
+/**
+ * Report whether `str` is accepted by speller.spell(), optionally print its
+ * analyses (flagging those containing "Use/-Spell"), and then run
+ * do_predict() on it in either case.
+ */
+void
+do_spell(ZHfstOspeller &speller, const std::string &str)
+{
+    if (speller.spell(str))
+    {
+        hfst_fprintf(stdout, "\"%s\" is in the lexicon...\n", str.c_str());
+        if (analyse)
+        {
+            hfst_fprintf(stdout, "analysing:\n");
+            hfst_ospell::AnalysisQueue anals = speller.analyse(str, false);
+            bool all_no_spell = true;
+            while (anals.size() > 0)
+            {
+                if (anals.top().first.find("Use/-Spell") != std::string::npos)
+                {
+                    hfst_fprintf(stdout, "%s %f [DISCARDED AS -Spell]\n",
+                                 anals.top().first.c_str(),
+                                 anals.top().second);
+                }
+                else
+                {
+                    all_no_spell = false;
+                    hfst_fprintf(stdout, "%s %f\n",
+                                 anals.top().first.c_str(),
+                                 anals.top().second);
+                }
+                anals.pop();
+            }
+            if (all_no_spell)
+            {
+                hfst_fprintf(stdout,
+                             "All spellings were invalidated by analysis! "
+                             ".:. Not in lexicon!\n");
+            }
+        }
+        // The format has no conversions, so no argument is passed.
+        hfst_fprintf(stdout, "(but correcting anyways)\n");
+        do_predict(speller, str);
+    }
+    else
+    {
+        hfst_fprintf(stdout, "\"%s\" is NOT in the lexicon:\n", str.c_str());
+        do_predict(speller, str);
+    }
+}
+
+/**
+ * Read standard input line by line and run do_spell() on every non-empty
+ * line. Shared by zhfst_spell() and legacy_spell().
+ *
+ * Returns EXIT_SUCCESS once input is exhausted. On non-Windows builds a
+ * line ending in '\r' ends the program with exit(1).
+ */
+static int
+spell_stdin_lines(ZHfstOspeller &speller)
+{
+    std::string line;
+#ifdef WINDOWS
+    SetConsoleCP(65001);
+    const HANDLE stdIn = GetStdHandle(STD_INPUT_HANDLE);
+    WCHAR buffer[0x1000];
+    DWORD numRead = 0;
+    // ReadConsoleW expects a count of characters, not of bytes.
+    while (ReadConsoleW(stdIn, buffer, sizeof buffer / sizeof buffer[0],
+                        &numRead, NULL))
+    {
+        // skip the newline; an empty read must not wrap around to a huge
+        // length
+        std::wstring wstr(buffer, numRead > 0 ? numRead - 1 : 0);
+        line = wide_string_to_string(wstr);
+#else
+    // Loop on the read itself. Testing eof() alone never terminates once a
+    // read fails without reaching end of input, and std::string removes the
+    // fixed 2000-byte line limit that could trigger such a failure.
+    while (std::getline(std::cin, line))
+    {
+#endif
+        if (line.empty())
+        {
+            continue;
+        }
+        if (line.back() == '\r')
+        {
+#ifdef WINDOWS
+            line.pop_back();
+#else
+            hfst_fprintf(stderr, "There is a WINDOWS linebreak in this file\n"
+                                 "Please convert with dos2unix or fromdos\n");
+            exit(1);
+#endif
+        }
+        do_spell(speller, line);
+    }
+    return EXIT_SUCCESS;
+}
+
+/**
+ * Load the ZHFST archive `zhfst_filename`, apply the limits given on the
+ * command line and check every line of standard input.
+ *
+ * Returns EXIT_FAILURE when the archive cannot be read, otherwise the
+ * result of the input loop.
+ */
+int
+zhfst_spell(char *zhfst_filename)
+{
+    ZHfstOspeller speller;
+    try
+    {
+        speller.read_zhfst(zhfst_filename);
+    }
+    catch (const hfst_ospell::ZHfstMetaDataParsingError &zhmdpe)
+    {
+        hfst_fprintf(stderr, "cannot finish reading zhfst archive %s:\n%s.\n",
+                     zhfst_filename, zhmdpe.what());
+        return EXIT_FAILURE;
+    }
+    catch (const hfst_ospell::ZHfstZipReadingError &zhzre)
+    {
+        hfst_fprintf(stderr,
+                     "cannot read zhfst archive %s:\n"
+                     "%s.\n",
+                     zhfst_filename, zhzre.what());
+        return EXIT_FAILURE;
+    }
+    catch (const hfst_ospell::ZHfstXmlParsingError &zhxpe)
+    {
+        hfst_fprintf(stderr,
+                     "Cannot finish reading index.xml from %s:\n"
+                     "%s.\n",
+                     zhfst_filename, zhxpe.what());
+        return EXIT_FAILURE;
+    }
+    if (verbose)
+    {
+        hfst_fprintf(stdout,
+                     "Following metadata was read from ZHFST archive:\n"
+                     "%s\n",
+                     speller.metadata_dump().c_str());
+    }
+    speller.set_queue_limit(suggs);
+    if (suggs != 0 && verbose)
+    {
+        hfst_fprintf(stdout, "Printing only %lu top predictions per line\n",
+                     suggs);
+    }
+    speller.set_weight_limit(max_weight);
+    if (max_weight >= 0.0 && verbose)
+    {
+        hfst_fprintf(stdout, "Not printing predictions worse than %f\n",
+                     max_weight);
+    }
+    speller.set_beam(beam);
+    if (beam >= 0.0 && verbose)
+    {
+        hfst_fprintf(stdout,
+                     "Not printing predictions worse than best by margin %f\n",
+                     beam);
+    }
+    speller.set_time_cutoff(time_cutoff);
+    if (time_cutoff > 0.0 && verbose)
+    {
+        hfst_fprintf(
+            stdout, "Not trying to find better predictions after %f seconds\n",
+            time_cutoff);
+    }
+    if ((!continuation_marker.empty()) && verbose)
+    {
+        hfst_fprintf(stdout, "%s marks incomplete words\n",
+                     continuation_marker.c_str());
+    }
+    return spell_stdin_lines(speller);
+}
+
+/**
+ * Wrap the separately loaded speller `s` in a ZHfstOspeller, apply the
+ * limits given on the command line and check every line of standard input.
+ *
+ * NOTE(review): unlike zhfst_spell(), time_cutoff is not applied here --
+ * confirm whether that is intentional.
+ */
+int
+legacy_spell(hfst_ospell::Speller *s)
+{
+    ZHfstOspeller speller;
+    speller.inject_speller(s);
+    speller.set_queue_limit(suggs);
+    if (suggs != 0 && verbose)
+    {
+        hfst_fprintf(stdout, "Printing only %lu top predictions per line\n",
+                     suggs);
+    }
+    speller.set_weight_limit(max_weight);
+    if (max_weight >= 0.0 && verbose)
+    {
+        // %f needs the weight itself; suggs is an unsigned long.
+        hfst_fprintf(stdout, "Not printing predictions worse than %f\n",
+                     max_weight);
+    }
+    speller.set_beam(beam);
+    if (beam >= 0.0 && verbose)
+    {
+        hfst_fprintf(stdout,
+                     "Not printing predictions worse than best by margin %f\n",
+                     beam);
+    }
+    return spell_stdin_lines(speller);
+}
+
+/**
+ * Parse the command line, then run either on a single ZHFST archive given
+ * as the only positional argument or on an error model and lexicon given
+ * with --error-model and --lexicon.
+ */
+int
+main(int argc, char **argv)
+{
+
+#if HAVE_GETOPT_H
+    int c;
+    // std::locale::global(std::locale(""));
+    while (true)
+    {
+        // Only options that have a case below are listed; anything else is
+        // reported by getopt_long and ends up in the default branch.
+        static struct option long_options[]
+            = { // first the hfst-mandated options
+                { "help", no_argument, 0, 'h' },
+                { "version", no_argument, 0, 'V' },
+                { "verbose", no_argument, 0, 'v' },
+                { "quiet", no_argument, 0, 'q' },
+                { "silent", no_argument, 0, 's' },
+                { "analyse", no_argument, 0, 'a' },
+                { "limit", required_argument, 0, 'n' },
+                { "max-weight", required_argument, 0, 'w' },
+                { "beam", required_argument, 0, 'b' },
+                { "time-cutoff", required_argument, 0, 't' },
+                { "error-model", required_argument, 0, 'm' },
+                { "lexicon", required_argument, 0, 'l' },
+                { "continuation", required_argument, 0, 'C' },
+#ifdef WINDOWS
+                { "output-to-console", no_argument, 0, 'k' },
+#endif
+                { 0, 0, 0, 0 }
+              };
+
+        int option_index = 0;
+        c = getopt_long(argc, argv, "hVvqsan:w:b:t:m:l:kC:", long_options,
+                        &option_index);
+        char *endptr = 0;
+
+        if (c == -1) // no more options to look at
+            break;
+
+        switch (c)
+        {
+        case 'h':
+            print_usage();
+            return EXIT_SUCCESS;
+            break;
+
+        case 'V':
+            print_version();
+            return EXIT_SUCCESS;
+            break;
+
+        case 'v':
+            verbose = true;
+            quiet = false;
+            break;
+
+        case 'q': // fallthrough
+        case 's':
+            quiet = true;
+            verbose = false;
+            break;
+        case 'a':
+            analyse = true;
+            break;
+        case 'n':
+            suggs = strtoul(optarg, &endptr, 10);
+            if (endptr == optarg)
+            {
+                fprintf(stderr, "%s not a strtoul number\n", optarg);
+                exit(1);
+            }
+            else if (*endptr != '\0')
+            {
+                fprintf(stderr, "%s truncated from limit parameter\n", endptr);
+            }
+            break;
+        case 'w':
+            max_weight = strtof(optarg, &endptr);
+            if (endptr == optarg)
+            {
+                fprintf(stderr, "%s is not a float\n", optarg);
+                exit(1);
+            }
+            else if (*endptr != '\0')
+            {
+                fprintf(stderr, "%s truncated from max-weight parameter\n",
+                        endptr);
+            }
+
+            break;
+        case 'b':
+            beam = strtof(optarg, &endptr);
+            if (endptr == optarg)
+            {
+                fprintf(stderr, "%s is not a float\n", optarg);
+                exit(1);
+            }
+            else if (*endptr != '\0')
+            {
+                fprintf(stderr, "%s truncated from beam parameter\n", endptr);
+            }
+
+            break;
+        case 't':
+            time_cutoff = strtof(optarg, &endptr);
+            if (endptr == optarg)
+            {
+                fprintf(stderr, "%s is not a float\n", optarg);
+                exit(1);
+            }
+            else if (*endptr != '\0')
+            {
+                fprintf(stderr, "%s truncated from time-cutoff parameter\n",
+                        endptr);
+            }
+
+            break;
+#ifdef WINDOWS
+        case 'k':
+            output_to_console = true;
+            break;
+#endif
+        case 'm':
+            error_model_filename = optarg;
+            break;
+        case 'l':
+            lexicon_filename = optarg;
+            break;
+        case 'C':
+            continuation_marker = optarg;
+            break;
+        default:
+            std::cerr << "Invalid option\n\n";
+            print_short_help();
+            return EXIT_FAILURE;
+            break;
+        }
+    }
+#else
+    int optind = 1;
+#endif
+    // no more options, we should now be at the input filenames
+    if (optind == (argc - 1))
+    {
+        if (error_model_filename != "" || lexicon_filename != "")
+        {
+            std::cerr << "Give *either* a zhfst speller or --error-model and "
+                         "--lexicon"
+                      << std::endl;
+            print_short_help();
+            return EXIT_FAILURE;
+        }
+        return zhfst_spell(argv[optind]);
+    }
+    else if (optind < (argc - 1))
+    {
+        std::cerr << "Too many file parameters" << std::endl;
+        print_short_help();
+        return EXIT_FAILURE;
+    }
+    else if (optind >= argc)
+    {
+        if (error_model_filename == "" || lexicon_filename == "")
+        {
+            std::cerr << "Give *either* a zhfst speller or --error-model and "
+                         "--lexicon"
+                      << std::endl;
+            print_short_help();
+            return EXIT_FAILURE;
+        }
+        // "rb": the transducer files are binary. The flag changes nothing on
+        // POSIX and prevents newline translation on Windows.
+        FILE *err_file = fopen(error_model_filename.c_str(), "rb");
+        if (err_file == NULL)
+        {
+            perror(error_model_filename.c_str());
+            return EXIT_FAILURE;
+        }
+        FILE *lex_file = fopen(lexicon_filename.c_str(), "rb");
+        if (lex_file == NULL)
+        {
+            perror(lexicon_filename.c_str());
+            fclose(err_file);
+            return EXIT_FAILURE;
+        }
+        hfst_ospell::Transducer err(err_file);
+        hfst_ospell::Transducer lex(lex_file);
+        hfst_ospell::Speller *s = new hfst_ospell::Speller(&err, &lex);
+        return legacy_spell(s);
+    }
+    return EXIT_SUCCESS;
+}