diff --git a/configure.ac b/configure.ac
index 44ae27dd693a694264cd2a53887eebe97f65b657..bc5fc9dbee690edfc550db765f5d0fc9db14f504 100644
--- a/configure.ac
+++ b/configure.ac
@@ -17,7 +17,7 @@
 # autoconf requirements
 AC_PREREQ([2.62])
-AC_INIT([hfstospell], [0.5.2], [hfst-bugs@helsinki.fi], [hfstospell], [http://hfst.github.io])
+AC_INIT([hfstospell], [0.5.3], [hfst-bugs@helsinki.fi], [hfstospell], [http://hfst.github.io])
 
 LT_PREREQ([2.2.6])
 
@@ -34,7 +34,7 @@ AC_CONFIG_HEADERS([config.h])
 HFSTOSPELL_NAME=hfstospell
 HFSTOSPELL_MAJOR=0
 HFSTOSPELL_MINOR=5
-HFSTOSPELL_EXTENSION=.2
+HFSTOSPELL_EXTENSION=.3
 HFSTOSPELL_VERSION=$HFSTOSPELL_MAJOR.$HFSTOSPELL_MINOR$HFSTOSPELL_EXTENSION
 AC_SUBST(HFSTOSPELL_MAJOR)
 AC_SUBST(HFSTOSPELL_MINOR)
diff --git a/debian/changelog b/debian/changelog
index 9742f2c0906c3bfe76e642f30f1ceddeaae2d3c7..769949ded935f2deffa426d9e03b37238d0ea003 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,19 @@
+hfst-ospell (0.5.3-1) unstable; urgency=low
+
+  [ Tino Didriksen ]
+  * Update to latest upstream.
+
+  [ Debian Janitor ]
+  * Set upstream metadata fields: Bug-Database, Bug-Submit, Repository,
+    Repository-Browse.
+  * Drop unnecessary dependency on dh-autoreconf.
+
+  [ Kartik Mistry ]
+  * debian/control:
+    + Updated Standards-Version to 4.6.0
+
+ -- Kartik Mistry <kartik@debian.org>  Sat, 05 Mar 2022 22:03:44 +0530
+
 hfst-ospell (0.5.2-1) unstable; urgency=low
 
   [ Tino Didriksen ]
diff --git a/debian/control b/debian/control
index 16392ec1663cf34ba70d174d1d60e6e104170e10..b27d6218505838fe454f322bfdd6e3050e77f6cf 100644
--- a/debian/control
+++ b/debian/control
@@ -10,7 +10,7 @@ Build-Depends: autoconf,
                libicu-dev,
                pkg-config,
                zip
-Standards-Version: 4.5.0
+Standards-Version: 4.6.0
 Homepage: https://github.com/hfst/hfst-ospell
 Vcs-Git: https://salsa.debian.org/science-team/hfst-ospell.git
 Vcs-Browser: https://salsa.debian.org/science-team/hfst-ospell
@@ -31,8 +31,6 @@ Multi-Arch: same
 Section: libs
 Depends: ${misc:Depends}, ${shlibs:Depends}
 Provides: libhfstospell
-Conflicts: libhfstospell, libhfstospell10
-Replaces: libhfstospell, libhfstospell10
 Description: HFST spell checker runtime libraries
  Minimal HFST optimized lookup format based spell checker library and a
  demonstrational implementation of command line based spell checker.
diff --git a/debian/copyright b/debian/copyright
index 9019ad34ca2ecface894ed541326f680e665dbaa..dfe70ed2aded09ddf018f1c3f9640db140365383 100644
--- a/debian/copyright
+++ b/debian/copyright
@@ -27,7 +27,7 @@ License: GPL-3+
  GNU General Public License for more details.
  .
  You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
  .
  On Debian systems, the complete text of the GNU General Public
  License version 3 can be found in "/usr/share/common-licenses/GPL-3".
diff --git a/debian/rules b/debian/rules
index ba383b91466f72e40bd5490b5ec28bbad528b113..2f0ba35b489cc37222707d7900819411b5b73766 100755
--- a/debian/rules
+++ b/debian/rules
@@ -10,7 +10,7 @@ DPKG_EXPORT_BUILDFLAGS = 1
 include /usr/share/dpkg/buildflags.mk
 
 %:
-	dh $@ --with autoreconf
+	dh $@
 
 override_dh_auto_configure:
 	dh_auto_configure -- --disable-static --enable-zhfst --without-libxmlpp --without-tinyxml2
@@ -21,7 +21,7 @@ override_dh_auto_install:
 
 ifeq ($(filter nocheck,$(DEB_BUILD_OPTIONS)),)
 override_dh_auto_test:
-	make -j1 check
+	dh_auto_test --no-parallel
 endif
 
 override_dh_missing:
diff --git a/debian/upstream/metadata b/debian/upstream/metadata
new file mode 100644
index 0000000000000000000000000000000000000000..f5f3cae230f6bc487bad69c54a024040bbf5f21d
--- /dev/null
+++ b/debian/upstream/metadata
@@ -0,0 +1,5 @@
+---
+Bug-Database: https://github.com/hfst/hfst-ospell/issues
+Bug-Submit: https://github.com/hfst/hfst-ospell/issues/new
+Repository: https://github.com/hfst/hfst-ospell.git
+Repository-Browse: https://github.com/hfst/hfst-ospell
diff --git a/debian/watch b/debian/watch
index 3c8857b24e470d03129bee6dcd6cd45e5f8646d7..6cbc031339c31b6c2f3b5ef033aca400b137f221 100644
--- a/debian/watch
+++ b/debian/watch
@@ -1,4 +1,3 @@
 version=4
-opts="filenamemangle=s%(?:.*?)?v?(\d[\d.]*)\.tar\.gz%hfst-ospell-$1.tar.gz%" \
-    https://github.com/hfst/hfst-ospell/tags \
-    (?:.*?/)?v?(\d[\d.]*)\.tar\.gz debian uupdate
+https://github.com/hfst/hfst-ospell/releases \
+    .*/@PACKAGE@-(\d[\d.]*)\.tar\.bz2 debian uupdate
diff --git a/hfst-ol.cc b/hfst-ol.cc
index 04d67430a30881e675d4981c7314ad8bf5c7339c..e0a52f43922ba9a1db8dc08da14ad0429cbafc4b 100644
--- a/hfst-ol.cc
+++ b/hfst-ol.cc
@@ -177,8 +177,8 @@ void TransducerHeader::skip_hfst3_header(FILE * f)
         HFSTOSPELL_THROW_MESSAGE(HeaderParsingException,
                                  "Found broken HFST3 header\n");
     }
-    char * headervalue = new char[remaining_header_len];
-    if (fread(headervalue, remaining_header_len, 1, f) != 1)
+    std::string headervalue(remaining_header_len, '\0');
+    if (fread(&headervalue[0], remaining_header_len, 1, f) != 1)
     {
         HFSTOSPELL_THROW_MESSAGE(HeaderParsingException,
                                  "HFST3 header ended unexpectedly\n");
@@ -187,12 +187,10 @@ void TransducerHeader::skip_hfst3_header(FILE * f)
         HFSTOSPELL_THROW_MESSAGE(HeaderParsingException,
                                  "Found broken HFST3 header\n");
     }
-    std::string header_tail(headervalue, remaining_header_len);
-    size_t type_field = header_tail.find("type");
+    auto type_field = headervalue.find("type");
     if (type_field != std::string::npos) {
-        if (header_tail.find("HFST_OL") != type_field + 5 &&
-            header_tail.find("HFST_OLW") != type_field + 5) {
-            delete[] headervalue;
+        if (headervalue.find("HFST_OL") != type_field + 5 &&
+            headervalue.find("HFST_OLW") != type_field + 5) {
             HFSTOSPELL_THROW_MESSAGE(
                 TransducerTypeException,
                 "Transducer has incorrect type, should be "
@@ -809,7 +807,7 @@ void Encoder::read_input_symbol(const char * s, const int s_num)
         // If this is shadowed by an ascii symbol, unshadow
         ascii_symbols[(unsigned char)(*s)] = NO_SYMBOL;
     }
-    
+
     letters.add_string(s, static_cast<SymbolNumber>(s_num));
 }
 
diff --git a/main.cc b/main.cc
index e4fd676243b8b39989f12f3646694601c9e2fecf..70ca402fe16a757ddb4a0a8ab76581e56f739ed6 100644
--- a/main.cc
+++ b/main.cc
@@ -1,19 +1,19 @@
 /*
-  
+
   Copyright 2009 University of Helsinki
-  
+
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at
-  
+
   http://www.apache.org/licenses/LICENSE-2.0
-  
+
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
   implied. See the License for the specific language governing
   permissions and limitations under the License.
-  
+
 */
 
 /*
@@ -165,7 +165,7 @@ void
 do_suggest(ZHfstOspeller& speller, const std::string& str)
 {
     hfst_ospell::CorrectionQueue corrections = speller.suggest(str);
-    if (corrections.size() > 0) 
+    if (corrections.size() > 0)
     {
         hfst_fprintf(stdout, "Corrections for \"%s\":\n", str.c_str());
         while (corrections.size() > 0)
@@ -181,7 +181,7 @@ do_suggest(ZHfstOspeller& speller, const std::string& str)
                     std::string::npos)
                 {
                     hfst_fprintf(stdout, "%s %f %s "
-                                 "[DISCARDED BY ANALYSES]\n", 
+                                 "[DISCARDED BY ANALYSES]\n",
                                  corr.c_str(), corrections.top().second,
                                  anals.top().first.c_str());
                 }
@@ -203,8 +203,8 @@ do_suggest(ZHfstOspeller& speller, const std::string& str)
             }
             else
             {
-                hfst_fprintf(stdout, "%s %f\n", 
-                             corr.c_str(), 
+                hfst_fprintf(stdout, "%s %f\n",
+                             corr.c_str(),
                              corrections.top().second);
             }
             corrections.pop();
@@ -222,7 +222,7 @@ do_suggest(ZHfstOspeller& speller, const std::string& str)
 void
 do_spell(ZHfstOspeller& speller, const std::string& str)
 {
-    if (speller.spell(str)) 
+    if (speller.spell(str))
     {
         hfst_fprintf(stdout, "\"%s\" is in the lexicon...\n",
                      str.c_str());
@@ -251,7 +251,7 @@ do_spell(ZHfstOspeller& speller, const std::string& str)
         }
         if (all_no_spell)
         {
-            hfst_fprintf(stdout, 
+            hfst_fprintf(stdout,
                          "All spellings were invalidated by analysis! "
                          ".:. Not in lexicon!\n");
         }
@@ -281,43 +281,33 @@ zhfst_spell(char* zhfst_filename)
   {
     speller.read_zhfst(zhfst_filename);
   }
-  catch (hfst_ospell::ZHfstMetaDataParsingError zhmdpe)
+  catch (hfst_ospell::ZHfstMetaDataParsingError& zhmdpe)
   {
-      hfst_fprintf(stderr, "cannot finish reading zhfst archive %s:\n%s.\n", 
+      hfst_fprintf(stderr, "cannot finish reading zhfst archive %s:\n%s.\n",
                    zhfst_filename, zhmdpe.what());
-      //std::cerr << "cannot finish reading zhfst archive " << zhfst_filename <<
-      //    ":\n" << zhmdpe.what() << "." << std::endl;
       return EXIT_FAILURE;
   }
-  catch (hfst_ospell::ZHfstZipReadingError zhzre)
+  catch (hfst_ospell::ZHfstZipReadingError& zhzre)
   {
-      //std::cerr << "cannot read zhfst archive " << zhfst_filename << ":\n"
-      //    << zhzre.what() << "." << std::endl
-      //    << "trying to read as legacy automata directory" << std::endl;
-      hfst_fprintf(stderr, 
+      hfst_fprintf(stderr,
                    "cannot read zhfst archive %s:\n"
                    "%s.\n", zhfst_filename, zhzre.what());
       return EXIT_FAILURE;
   }
-  catch (hfst_ospell::ZHfstXmlParsingError zhxpe)
+  catch (hfst_ospell::ZHfstXmlParsingError& zhxpe)
  {
-      //std::cerr << "Cannot finish reading index.xml from "
-      //  << zhfst_filename << ":" << std::endl
-      //  << zhxpe.what() << "." << std::endl;
-      hfst_fprintf(stderr, 
+      hfst_fprintf(stderr,
                    "Cannot finish reading index.xml from %s:\n"
-                   "%s.\n", 
+                   "%s.\n",
                    zhfst_filename, zhxpe.what());
       return EXIT_FAILURE;
   }
   if (verbose)
   {
-      //std::cout << "Following metadata was read from ZHFST archive:" << std::endl
-      //          << speller.metadata_dump() << std::endl;
-      hfst_fprintf(stdout, 
+      hfst_fprintf(stdout,
                    "Following metadata was read from ZHFST archive:\n"
-                   "%s\n", 
+                   "%s\n",
                    speller.metadata_dump().c_str());
   }
   speller.set_queue_limit(suggs);
@@ -354,7 +344,7 @@ zhfst_spell(char* zhfst_filename)
         std::string linestr = wide_string_to_string(wstr);
         free(str);
         str = strdup(linestr.c_str());
-#else 
+#else
     while (!std::cin.eof()) {
         std::cin.getline(str, 2000);
 #endif
@@ -398,7 +388,7 @@ int
         hfst_fprintf(stdout, "Not printing suggestions worse than best by margin %f\n", suggs);
     }
     char * str = (char*) malloc(2000);
-    
+
 #ifdef WINDOWS
     SetConsoleCP(65001);
     const HANDLE stdIn = GetStdHandle(STD_INPUT_HANDLE);
@@ -410,7 +400,7 @@ int
         std::string linestr = wide_string_to_string(wstr);
         free(str);
         str = strdup(linestr.c_str());
-#else 
+#else
     while (!std::cin.eof()) {
         std::cin.getline(str, 2000);
 #endif
@@ -435,11 +425,11 @@ int
 int main(int argc, char **argv)
 {
-    
+
+
+#if HAVE_GETOPT_H
     int c;
     //std::locale::global(std::locale(""));
-    
-#if HAVE_GETOPT_H
     while (true)
     {
         static struct option long_options[] =
         {
@@ -463,7 +453,7 @@ int main(int argc, char **argv)
 #endif
             {0, 0, 0, 0 }
         };
-        
+
         int option_index = 0;
         c = getopt_long(argc, argv, "hVvqsan:w:b:t:SXm:l:k", long_options, &option_index);
         char* endptr = 0;
@@ -476,17 +466,17 @@ int main(int argc, char **argv)
             print_usage();
             return EXIT_SUCCESS;
             break;
-          
+
         case 'V':
             print_version();
             return EXIT_SUCCESS;
             break;
-          
+
         case 'v':
             verbose = true;
             quiet = false;
             break;
-          
+
         case 'q': // fallthrough
         case 's':
             quiet = true;
@@ -550,7 +540,7 @@ int main(int argc, char **argv)
         case 'k':
            output_to_console = true;
            break;
-#endif 
+#endif
         case 'S':
            suggest = true;
            break;
diff --git a/office.cc b/office.cc
index 822e55ac7f8eb096296a0862ec13480e75d3cc23..5962e972ad0c83cd542bd5cfafe086d43ae1f6cf 100644
--- a/office.cc
+++ b/office.cc
@@ -21,16 +21,16 @@
 */
 
 /*
-	Tests up to 16 variations of each input token:
+	Tests up to 8 variations of each input token:
 	- Verbatim
 	- With leading non-alphanumerics removed
 	- With trailing non-alphanumerics removed
 	- With leading and trailing non-alphanumerics removed
-	- Lower-case of all the above
-	- First-upper of all the above
+	- First-lower of all the above
 */
 
 #include <iostream>
+#include <iomanip>
 #include <fstream>
 #include <vector>
 #include <string>
@@ -42,6 +42,7 @@
 #include <cmath>
 #include <cerrno>
 #include <cctype>
+#include <getopt.h>
 
 #define U_CHARSET_IS_UTF8 1
 #include <unicode/uclean.h>
@@ -64,13 +65,18 @@ struct word_t {
 	UnicodeString buffer;
 };
 std::vector<word_t> words(16);
-std::string buffer;
-std::vector<std::string> alts;
+std::string buffer, wbuf;
+using Alt = std::pair<double,std::string>;
+std::vector<Alt> alts;
 std::unordered_set<std::string> outputs;
 UnicodeString ubuffer, uc_buffer;
 size_t cw;
 
 bool verbatim = false;
+bool debug = false;
+hfst_ospell::Weight max_weight = -1.0;
+hfst_ospell::Weight beam = -1.0;
+float time_cutoff = 6.0;
 bool uc_first = false;
 bool uc_all = true;
@@ -82,14 +88,18 @@ bool find_alternatives(ZHfstOspeller& speller, size_t suggs) {
 	for (size_t k=0 ; k < cw && alts.size()<suggs ; ++k) {
 		buffer.clear();
 		words[k].buffer.toUTF8String(buffer);
-		hfst_ospell::CorrectionQueue corrections = speller.suggest(buffer);
+		auto corrections = speller.suggest(buffer);
 
 		if (corrections.size() == 0) {
 			continue;
 		}
 
-		// Because speller.set_queue_limit() doesn't actually work, hard limit it here
-		for (size_t i=0, e=corrections.size() ; i<e && alts.size()<suggs ; ++i) {
+		for (size_t i=0, e=corrections.size() ; i<e ; ++i) {
+			// Work around https://github.com/hfst/hfst-ospell/issues/54
+			if (max_weight > 0.0 && corrections.top().second > max_weight) {
+				break;
+			}
+			auto w = corrections.top().second * (1.0 + k/10.0);
 			buffer.clear();
 
 			if (k != 0) {
@@ -112,8 +122,18 @@ bool find_alternatives(ZHfstOspeller& speller, size_t suggs) {
 				words[0].buffer.tempSubString(words[k].start + words[k].count).toUTF8String(buffer);
 			}
 
+			if (debug) {
+				wbuf.resize(64);
+				wbuf.resize(sprintf(&wbuf[0], " (%.2f;%zu)", corrections.top().second, k));
+				buffer += wbuf;
+			}
+
 			if (outputs.count(buffer) == 0) {
-				alts.push_back(buffer);
+				alts.push_back({w, buffer});
+				std::sort(alts.begin(), alts.end());
+				while (alts.size() > suggs) {
+					alts.pop_back();
+				}
 			}
 			outputs.insert(buffer);
 			corrections.pop();
@@ -123,7 +143,7 @@ bool find_alternatives(ZHfstOspeller& speller, size_t suggs) {
 	if (!alts.empty()) {
 		std::cout << "&";
 		for (auto& alt : alts) {
-			std::cout << "\t" << alt;
+			std::cout << "\t" << alt.second;
 		}
 		std::cout << std::endl;
 		return true;
@@ -167,7 +187,7 @@ bool is_valid_word(ZHfstOspeller& speller, const std::string& word, size_t suggs
 	}
 
 	size_t ichStart = 0, cchUse = ubuffer.length();
-	const UChar *pwsz = ubuffer.getTerminatedBuffer();
+	auto pwsz = ubuffer.getTerminatedBuffer();
 
 	// Always test the full given input
 	words[0].buffer.remove();
@@ -216,7 +236,7 @@ bool is_valid_word(ZHfstOspeller& speller, const std::string& word, size_t suggs
 
 	for (size_t i=0, e=cw ; i<e ; ++i) {
 		// If we are looking for suggestions, don't use the cache
-		valid_words_t::iterator it = suggs ? valid_words.end() : valid_words.find(words[i].buffer);
+		auto it = suggs ? valid_words.end() : valid_words.find(words[i].buffer);
 
 		if (it == valid_words.end()) {
 			buffer.clear();
@@ -224,49 +244,21 @@ bool is_valid_word(ZHfstOspeller& speller, const std::string& word, size_t suggs
 			bool valid = speller.spell(buffer);
 			it = valid_words.insert(std::make_pair(words[i].buffer,valid)).first;
 
-			if (!valid && !verbatim) {
-				// If the word was not valid, fold it to lower case and try again
-				buffer.clear();
-				ubuffer = words[i].buffer;
-				ubuffer.toLower();
-				ubuffer.toUTF8String(buffer);
-
-				// Add the lower case variant to the list so that we get suggestions using that, if need be
-				words[cw].start = words[i].start;
-				words[cw].count = words[i].count;
-				words[cw].buffer = ubuffer;
-				++cw;
-
-				// Don't try again if the lower cased variant has already been tried
-				valid_words_t::iterator itl = suggs ? valid_words.end() : valid_words.find(ubuffer);
-				if (itl != valid_words.end()) {
-					it->second = itl->second;
-					it = itl;
-				}
-				else {
-					valid = speller.spell(buffer);
-					it->second = valid; // Also mark the original mixed case variant as whatever the lower cased one was
-					it = valid_words.insert(std::make_pair(words[i].buffer,valid)).first;
-				}
-			}
-
-			if (!valid && !verbatim && (uc_all || uc_first)) {
-				// If the word was still not valid but had upper case, try a first-upper variant
+			if (!valid && !verbatim && uc_first) {
+				// If the word was not valid, try a first-lower variant
 				buffer.clear();
 				ubuffer.setTo(words[i].buffer, 0, 1);
-				ubuffer.toUpper();
-				uc_buffer.setTo(words[i].buffer, 1);
-				uc_buffer.toLower();
-				ubuffer.append(uc_buffer);
+				ubuffer.toLower();
+				ubuffer.append(words[i].buffer, 1, words[i].buffer.length() - 1);
 				ubuffer.toUTF8String(buffer);
 
-				// Add the first-upper variant to the list so that we get suggestions using that, if need be
+				// Add the first-lower case variant to the list so that we get suggestions using that, if need be
 				words[cw].start = words[i].start;
 				words[cw].count = words[i].count;
 				words[cw].buffer = ubuffer;
 				++cw;
 
-				// Don't try again if the first-upper variant has already been tried
+				// Don't try again if the first-lower variant has already been tried
 				valid_words_t::iterator itl = suggs ? valid_words.end() : valid_words.find(ubuffer);
 				if (itl != valid_words.end()) {
 					it->second = itl->second;
@@ -274,7 +266,7 @@ bool is_valid_word(ZHfstOspeller& speller, const std::string& word, size_t suggs
 				}
 				else {
 					valid = speller.spell(buffer);
-					it->second = valid; // Also mark the original mixed case variant as whatever the first-upper one was
+					it->second = valid; // Also mark the original mixed case variant as whatever the first-lower one was
 					it = valid_words.insert(std::make_pair(words[i].buffer,valid)).first;
 				}
 			}
@@ -291,8 +283,13 @@ bool is_valid_word(ZHfstOspeller& speller, const std::string& word, size_t suggs
 int zhfst_spell(const char* zhfst_filename) {
 	ZHfstOspeller speller;
 	try {
+		if (debug) {
+			std::cout << "@@ Loading " << zhfst_filename << " with args max-weight=" << max_weight << ", beam=" << beam << ", time-cutoff=" << time_cutoff << std::endl;
+		}
 		speller.read_zhfst(zhfst_filename);
-		speller.set_time_cutoff(6.0);
+		speller.set_weight_limit(max_weight);
+		speller.set_beam(beam);
+		speller.set_time_cutoff(time_cutoff);
 	}
 	catch (hfst_ospell::ZHfstMetaDataParsingError zhmdpe) {
 		fprintf(stderr, "cannot finish reading zhfst archive %s:\n%s.\n", zhfst_filename, zhmdpe.what());
@@ -319,6 +316,38 @@ int zhfst_spell(const char* zhfst_filename) {
 		if (line.empty()) {
 			continue;
 		}
+
+		if (line.size() >= 5 && line[0] == '$' && line[1] == '$' && line[3] == ' ') {
+			if (line[2] == 'd' && isdigit(line[4]) && line.size() == 5) {
+				debug = (line[4] != '0');
+				std::cout << "@@ Option debug changed to " << debug << std::endl;
+				continue;
+			}
+			if (line[2] == 'T' && isdigit(line[4]) && line.size() == 5) {
+				verbatim = (line[4] != '0');
+				std::cout << "@@ Option verbatim changed to " << verbatim << std::endl;
+				continue;
+			}
+			if (line[2] == 'w' && isdigit(line[4])) {
+				max_weight = std::stof(&line[4]);
+				speller.set_weight_limit(max_weight);
+				std::cout << "@@ Option max-weight changed to " << max_weight << std::endl;
+				continue;
+			}
+			if (line[2] == 'b' && isdigit(line[4])) {
+				beam = std::stof(&line[4]);
+				speller.set_beam(beam);
+				std::cout << "@@ Option beam changed to " << beam << std::endl;
+				continue;
+			}
+			if (line[2] == 't' && isdigit(line[4])) {
+				time_cutoff = std::stof(&line[4]);
+				speller.set_time_cutoff(time_cutoff);
+				std::cout << "@@ Option time-cutoff changed to " << time_cutoff << std::endl;
+				continue;
+			}
+		}
+
 		// Just in case anyone decides to use the speller for a minor eternity
 		if (valid_words.size() > 20480) {
 			valid_words.clear();
@@ -345,6 +374,19 @@ int zhfst_spell(const char* zhfst_filename) {
 	return EXIT_SUCCESS;
 }
 
+void print_help() {
+	std::cout
+		<< "Usage: hfst-ospell [options] zhfst-archive\n"
+		<< "\n"
+		<< " -h, --help            Shows this help\n"
+		<< " -d, --debug           Debug output with weights attached to results\n"
+		<< " -T, --verbatim        Disables case-folding and non-alphanumeric trimming\n"
+		<< " -w, --max-weight=W    Suppress corrections with weights above W\n"
+		<< " -b, --beam=W          Suppress corrections worse than best candidate by more than W\n"
+		<< " -t, --time-cutoff=T   Stop trying to find better corrections after T seconds; defaults to 6.0\n"
+		<< std::flush;
+}
+
 int main(int argc, char **argv) {
 	UErrorCode status = U_ZERO_ERROR;
 	u_init(&status);
@@ -356,22 +398,60 @@ int main(int argc, char **argv) {
 	ucnv_setDefaultName("UTF-8");
 	uloc_setDefault("en_US_POSIX", &status);
 
-	std::vector<std::string> args(argv, argv+argc);
-	for (std::vector<std::string>::iterator it=args.begin() ; it != args.end() ; ) {
-		if (*it == "--verbatim") {
-			verbatim = true;
-			it = args.erase(it);
+	struct option long_options[] =
+		{
+		{"help",         no_argument,       0, 'h'},
+		{"debug",        no_argument,       0, 'd'},
+		{"verbatim",     no_argument,       0, 'T'},
+		{"max-weight",   required_argument, 0, 'w'},
+		{"beam",         required_argument, 0, 'b'},
+		{"time-cutoff",  required_argument, 0, 't'},
+		{0,              0,                 0,  0 }
+		};
+
+	int c = 0;
+	while (true) {
+		int option_index = 0;
+		c = getopt_long(argc, argv, "hdTw:b:t:", long_options, &option_index);
+
+		if (c == -1) {
+			break;
 		}
-		else {
-			++it;
+
+		switch (c) {
+		case 'h':
+			print_help();
+			return EXIT_SUCCESS;
+
+		case 'd':
+			debug = true;
+			break;
+
+		case 'T':
+			verbatim = true;
+			break;
+
+		case 'w':
+			max_weight = std::stof(optarg);
+			break;
+
+		case 'b':
+			beam = std::stof(optarg);
+			break;
+
+		case 't':
+			time_cutoff = std::stof(optarg);
+			break;
 		}
 	}
 
-	if (args.size() < 2) {
+	if (optind >= argc) {
 		throw std::invalid_argument("Must pass a zhfst as argument");
 	}
 
-	int rv = zhfst_spell(args[1].c_str());
+	std::cerr << std::fixed << std::setprecision(2);
+	std::cout << std::fixed << std::setprecision(2);
+	int rv = zhfst_spell(argv[optind]);
 	u_cleanup();
 	return rv;
diff --git a/ol-exceptions.h b/ol-exceptions.h
index 2e06a6bc856b6d5851069b18000413103b993516..61619d88950d98cfaa782af8ec4591a867873745 100644
--- a/ol-exceptions.h
+++ b/ol-exceptions.h
@@ -4,6 +4,7 @@
 #include "hfstol-stdafx.h"
 #include <string>
 #include <sstream>
+#include <cstring>
 
 namespace hfst_ospell
 {
@@ -21,7 +22,7 @@ struct OspellException
     size_t line; //!< line number of exception
 
     OspellException(void) {}
-    
+
     //!
     //! construct exception with name, file and location
     OspellException(const std::string &name,const std::string &file,size_t line):
@@ -29,7 +30,7 @@ struct OspellException
         file(file),
         line(line)
         {}
-    
+
     //!
     //! create string representation of exception for output
     std::string operator() (void) const
@@ -45,7 +46,7 @@ struct OspellException
     {
         std::ostringstream o;
        o << file << ":" << line << ":" << name;
-        return o.str().c_str();
+        return strdup(o.str().c_str());
     }
 };
 
@@ -59,7 +60,7 @@ struct OspellException
 #define HFSTOSPELL_EXCEPTION_CHILD_DECLARATION(CHILD) \
     struct CHILD : public OspellException \
     { CHILD(const std::string &name,const std::string &file,size_t line):\
-        OspellException(name,file,line) {}} 
+        OspellException(name,file,line) {}}
 
 #define HFST_CATCH(E) \
     catch (const E &e) \
diff --git a/ospell.cc b/ospell.cc
index 1bde28d276e07e47c6e5aad5f719f8037620c4a3..bee1b948af5512a715d9b89d13c5f869d3e4a962 100644
--- a/ospell.cc
+++ b/ospell.cc
@@ -152,31 +152,31 @@ TreeNode TreeNode::update(SymbolNumber symbol,
 bool TreeNode::try_compatible_with(FlagDiacriticOperation op)
 {
     switch (op.Operation()) {
-        
+
     case P: // positive set
         flag_state[op.Feature()] = op.Value();
         return true;
-        
+
     case N: // negative set (literally, in this implementation)
         flag_state[op.Feature()] = -1*op.Value();
         return true;
-        
+
     case R: // require
         if (op.Value() == 0) { // "plain" require, return false if unset
             return (flag_state[op.Feature()] != 0);
         }
         return (flag_state[op.Feature()] == op.Value());
-        
+
     case D: // disallow
         if (op.Value() == 0) { // "plain" disallow, return true if unset
            return (flag_state[op.Feature()] == 0);
        }
        return (flag_state[op.Feature()] != op.Value());
-        
+
     case C: // clear
         flag_state[op.Feature()] = 0;
         return true;
-        
+
     case U: // unification
         /* if the feature is unset OR the feature is to this value already
            OR the feature is negatively set to something else than this value */
@@ -190,7 +190,7 @@ bool TreeNode::try_compatible_with(FlagDiacriticOperation op)
         }
         return false;
     }
-    
+
     return false; // to make the compiler happy
 }
 
@@ -204,7 +204,11 @@ Speller::Speller(Transducer* mutator_ptr, Transducer* lexicon_ptr):
     alphabet_translator(SymbolVector()),
     operations(lexicon->get_operations()),
     limiting(None),
-    mode(Correct)
+    mode(Correct),
+    max_time(-1.0),
+    start_clock(0),
+    call_counter(0),
+    limit_reached(false)
 {
     if (mutator != NULL) {
         build_alphabet_translator();
@@ -228,7 +232,7 @@ void Speller::lexicon_epsilons(void)
     }
     TransitionTableIndex next = lexicon->next(next_node.lexicon_state, 0);
     STransition i_s = lexicon->take_epsilons_and_flags(next);
-    
+
     while (i_s.symbol != NO_SYMBOL) {
         if (is_under_weight_limit(next_node.weight + i_s.weight)) {
             if (lexicon->transitions.input_symbol(next) == 0) {
@@ -326,7 +330,7 @@ void Speller::mutator_epsilons(void)
     }
     TransitionTableIndex next_m = mutator->next(next_node.mutator_state, 0);
     STransition mutator_i_s = mutator->take_epsilons(next_m);
-    
+
     while (mutator_i_s.symbol != NO_SYMBOL) {
         if (mutator_i_s.symbol == 0) {
             if (is_under_weight_limit(
@@ -460,12 +464,9 @@ bool Transducer::initialize_input_vector(SymbolVector & input_vector,
                                          char * line)
 {
     input_vector.clear();
-    SymbolNumber k = NO_SYMBOL;
     char ** inpointer = &line;
-    char * oldpointer;
     while (**inpointer != '\0') {
-        oldpointer = *inpointer;
-        k = encoder->find_key(inpointer);
+        SymbolNumber k = encoder->find_key(inpointer);
         if (k == NO_SYMBOL) { // no tokenization from alphabet
             // for real handling of other and identity for unseen symbols,
             // use the Speller interface analyse()!
@@ -532,18 +533,18 @@ AnalysisQueue Transducer::lookup(char * line)
                 i_s = take_epsilons_and_flags(next_index);
             }
         }
-        
+
         // input consumption loop
         unsigned int input_state = next_node.input_state;
         if (input_state < input.size() &&
             has_transitions(
                 next_node.lexicon_state + 1, input[input_state])) {
-            
+
             next_index = next(next_node.lexicon_state,
                               input[input_state]);
             STransition i_s = take_non_epsilons(next_index,
                                                 input[input_state]);
-            
+
             while (i_s.symbol != NO_SYMBOL) {
                 queue.push_back(next_node.update(
                                     i_s.symbol,
@@ -551,18 +552,18 @@ AnalysisQueue Transducer::lookup(char * line)
                                     next_node.mutator_state,
                                     i_s.index,
                                     i_s.weight));
-                
+
                 ++next_index;
                 i_s = take_non_epsilons(next_index, input[input_state]);
             }
         }
-        
+
     }
-    
+
     for (auto& it : outputs) {
         analyses.push(StringWeightPair(it.first, it.second));
     }
-    
+
     return analyses;
 }
 
@@ -729,7 +730,7 @@ Weight Transducer::final_weight(const TransitionTableIndex i) const
 bool
 Transducer::is_flag(const SymbolNumber symbol)
 {
-    return alphabet.is_flag(symbol); 
+    return alphabet.is_flag(symbol);
 }
 
 bool
@@ -888,7 +889,7 @@ CorrectionQueue Speller::correct(char * line, int nbest,
     std::map<std::string, Weight> corrections;
     SymbolNumber first_input = (input.size() == 0) ? 0 : input[0];
     if (cache[first_input].empty) {
-        build_cache(first_input);
+        build_cache(first_input); // XXX: cache corrupts limit!
     }
     if (input.size() <= 1) {
         // get the cached results and we're done
@@ -908,6 +909,7 @@ CorrectionQueue Speller::correct(char * line, int nbest,
             }
         }
     }
+    set_limiting_behaviour(nbest, maxweight, beam);
     adjust_weight_limits(nbest, beam);
     for(auto& it : *results) {
         // Then collect the results
@@ -946,6 +948,7 @@ CorrectionQueue Speller::correct(char * line, int nbest,
          */
         next_node = queue.back();
         queue.pop_back();
+        set_limiting_behaviour(nbest, maxweight, beam); // XXX: need to reset
         adjust_weight_limits(nbest, beam);
         // if we can't get an acceptable result, never mind
         if (next_node.weight > limit) {
@@ -1005,6 +1008,7 @@ CorrectionQueue Speller::correct(char * line, int nbest,
             }
         }
     }
+    //cache[first_input].clear();
     return correction_queue;
 }
 
@@ -1031,12 +1035,16 @@ void Speller::set_limiting_behaviour(int nbest, Weight maxweight, Weight beam)
         limiting = Nbest;
     } else if (maxweight < 0.0 && nbest == 0 && beam >= 0.0) {
         limiting = Beam;
+    } else {
+        return;
     }
 }
 
 void Speller::adjust_weight_limits(int nbest, Weight beam)
 {
-    if (limiting == Nbest && nbest_queue.size() >= nbest) {
+    if (limiting == MaxWeight) {
+        return;
+    } else if (limiting == Nbest && nbest_queue.size() >= nbest) {
         limit = nbest_queue.get_highest();
     } else if (limiting == MaxWeightNbest && nbest_queue.size() >= nbest) {
         limit = std::min(limit, nbest_queue.get_lowest());
@@ -1201,7 +1209,7 @@ void Speller::add_symbol_to_alphabet_translator(SymbolNumber to_sym)
     }
 }
 
 } // namespace hfst_ospell
-    
+
 char*
 hfst_strndup(const char* s, size_t n)
 {
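Note on the office.cc changes above: find_alternatives() now scores each candidate by the input variant that produced it (scaling its weight by 1.0 + k/10.0), keeps the alternatives sorted by weight, and trims the list to the requested number of suggestions. The short, self-contained C++ sketch below illustrates only that collection pattern; it reuses the Alt = std::pair<double,std::string> alias the patch introduces, but the sample data and everything else here are hypothetical stand-ins, and the real code additionally deduplicates candidates through the outputs set and pops them from a hfst_ospell::CorrectionQueue.

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

// Weight-first pair, as in office.cc, so std::sort orders by weight.
using Alt = std::pair<double, std::string>;

int main() {
    const std::size_t suggs = 3;   // hypothetical suggestion limit
    std::vector<Alt> alts;

    // Hypothetical (variant index k, (weight, correction)) results, standing in
    // for what speller.suggest() would return for each token variant.
    const std::vector<std::pair<std::size_t, Alt>> raw = {
        {0, {4.0, "word"}}, {0, {6.5, "ward"}},
        {1, {3.0, "wort"}}, {1, {7.0, "wards"}}, {1, {2.5, "sword"}},
    };

    for (const auto& r : raw) {
        const std::size_t k = r.first;
        // Penalize candidates that came from trimmed or case-folded variants.
        const double w = r.second.first * (1.0 + k / 10.0);
        alts.push_back({w, r.second.second});
        std::sort(alts.begin(), alts.end());   // best (lowest weight) first
        while (alts.size() > suggs) {
            alts.pop_back();                   // keep only the best N
        }
    }

    for (const auto& alt : alts) {
        std::cout << alt.second << " (" << alt.first << ")\n";
    }
    return 0;
}

Putting the weight first in the pair is what lets a plain std::sort order the list cheapest-first without a custom comparator, which appears to be the reason the patch defines Alt that way.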