Skip to content
Snippets Groups Projects
Commit 47508373 authored by Apertis CI
Browse files

Import Upstream version 0.5.3

parent 17e5d61f
Branches debian/trixie
Tags debian/0.5.4-1
1 merge request: !2 "Update from debian/bookworm for apertis/v2024dev2"
Pipeline #861357 skipped
...@@ -17,7 +17,7 @@ ...@@ -17,7 +17,7 @@
# autoconf requirements # autoconf requirements
AC_PREREQ([2.62]) AC_PREREQ([2.62])
AC_INIT([hfstospell], [0.5.2], [hfst-bugs@helsinki.fi], [hfstospell], [http://hfst.github.io]) AC_INIT([hfstospell], [0.5.3], [hfst-bugs@helsinki.fi], [hfstospell], [http://hfst.github.io])
LT_PREREQ([2.2.6]) LT_PREREQ([2.2.6])
...@@ -34,7 +34,7 @@ AC_CONFIG_HEADERS([config.h]) ...@@ -34,7 +34,7 @@ AC_CONFIG_HEADERS([config.h])
HFSTOSPELL_NAME=hfstospell HFSTOSPELL_NAME=hfstospell
HFSTOSPELL_MAJOR=0 HFSTOSPELL_MAJOR=0
HFSTOSPELL_MINOR=5 HFSTOSPELL_MINOR=5
HFSTOSPELL_EXTENSION=.2 HFSTOSPELL_EXTENSION=.3
HFSTOSPELL_VERSION=$HFSTOSPELL_MAJOR.$HFSTOSPELL_MINOR$HFSTOSPELL_EXTENSION HFSTOSPELL_VERSION=$HFSTOSPELL_MAJOR.$HFSTOSPELL_MINOR$HFSTOSPELL_EXTENSION
AC_SUBST(HFSTOSPELL_MAJOR) AC_SUBST(HFSTOSPELL_MAJOR)
AC_SUBST(HFSTOSPELL_MINOR) AC_SUBST(HFSTOSPELL_MINOR)
......
...@@ -177,8 +177,8 @@ void TransducerHeader::skip_hfst3_header(FILE * f) ...@@ -177,8 +177,8 @@ void TransducerHeader::skip_hfst3_header(FILE * f)
HFSTOSPELL_THROW_MESSAGE(HeaderParsingException, HFSTOSPELL_THROW_MESSAGE(HeaderParsingException,
"Found broken HFST3 header\n"); "Found broken HFST3 header\n");
} }
char * headervalue = new char[remaining_header_len]; std::string headervalue(remaining_header_len, '\0');
if (fread(headervalue, remaining_header_len, 1, f) != 1) if (fread(&headervalue[0], remaining_header_len, 1, f) != 1)
{ {
HFSTOSPELL_THROW_MESSAGE(HeaderParsingException, HFSTOSPELL_THROW_MESSAGE(HeaderParsingException,
"HFST3 header ended unexpectedly\n"); "HFST3 header ended unexpectedly\n");
...@@ -187,12 +187,10 @@ void TransducerHeader::skip_hfst3_header(FILE * f) ...@@ -187,12 +187,10 @@ void TransducerHeader::skip_hfst3_header(FILE * f)
HFSTOSPELL_THROW_MESSAGE(HeaderParsingException, HFSTOSPELL_THROW_MESSAGE(HeaderParsingException,
"Found broken HFST3 header\n"); "Found broken HFST3 header\n");
} }
std::string header_tail(headervalue, remaining_header_len); auto type_field = headervalue.find("type");
size_t type_field = header_tail.find("type");
if (type_field != std::string::npos) { if (type_field != std::string::npos) {
if (header_tail.find("HFST_OL") != type_field + 5 && if (headervalue.find("HFST_OL") != type_field + 5 &&
header_tail.find("HFST_OLW") != type_field + 5) { headervalue.find("HFST_OLW") != type_field + 5) {
delete[] headervalue;
HFSTOSPELL_THROW_MESSAGE( HFSTOSPELL_THROW_MESSAGE(
TransducerTypeException, TransducerTypeException,
"Transducer has incorrect type, should be " "Transducer has incorrect type, should be "
...@@ -809,7 +807,7 @@ void Encoder::read_input_symbol(const char * s, const int s_num) ...@@ -809,7 +807,7 @@ void Encoder::read_input_symbol(const char * s, const int s_num)
// If this is shadowed by an ascii symbol, unshadow // If this is shadowed by an ascii symbol, unshadow
ascii_symbols[(unsigned char)(*s)] = NO_SYMBOL; ascii_symbols[(unsigned char)(*s)] = NO_SYMBOL;
} }
letters.add_string(s, static_cast<SymbolNumber>(s_num)); letters.add_string(s, static_cast<SymbolNumber>(s_num));
} }
......
/* /*
Copyright 2009 University of Helsinki Copyright 2009 University of Helsinki
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
*/ */
/* /*
...@@ -165,7 +165,7 @@ void ...@@ -165,7 +165,7 @@ void
do_suggest(ZHfstOspeller& speller, const std::string& str) do_suggest(ZHfstOspeller& speller, const std::string& str)
{ {
hfst_ospell::CorrectionQueue corrections = speller.suggest(str); hfst_ospell::CorrectionQueue corrections = speller.suggest(str);
if (corrections.size() > 0) if (corrections.size() > 0)
{ {
hfst_fprintf(stdout, "Corrections for \"%s\":\n", str.c_str()); hfst_fprintf(stdout, "Corrections for \"%s\":\n", str.c_str());
while (corrections.size() > 0) while (corrections.size() > 0)
...@@ -181,7 +181,7 @@ do_suggest(ZHfstOspeller& speller, const std::string& str) ...@@ -181,7 +181,7 @@ do_suggest(ZHfstOspeller& speller, const std::string& str)
std::string::npos) std::string::npos)
{ {
hfst_fprintf(stdout, "%s %f %s " hfst_fprintf(stdout, "%s %f %s "
"[DISCARDED BY ANALYSES]\n", "[DISCARDED BY ANALYSES]\n",
corr.c_str(), corrections.top().second, corr.c_str(), corrections.top().second,
anals.top().first.c_str()); anals.top().first.c_str());
} }
...@@ -203,8 +203,8 @@ do_suggest(ZHfstOspeller& speller, const std::string& str) ...@@ -203,8 +203,8 @@ do_suggest(ZHfstOspeller& speller, const std::string& str)
} }
else else
{ {
hfst_fprintf(stdout, "%s %f\n", hfst_fprintf(stdout, "%s %f\n",
corr.c_str(), corr.c_str(),
corrections.top().second); corrections.top().second);
} }
corrections.pop(); corrections.pop();
...@@ -222,7 +222,7 @@ do_suggest(ZHfstOspeller& speller, const std::string& str) ...@@ -222,7 +222,7 @@ do_suggest(ZHfstOspeller& speller, const std::string& str)
void void
do_spell(ZHfstOspeller& speller, const std::string& str) do_spell(ZHfstOspeller& speller, const std::string& str)
{ {
if (speller.spell(str)) if (speller.spell(str))
{ {
hfst_fprintf(stdout, "\"%s\" is in the lexicon...\n", hfst_fprintf(stdout, "\"%s\" is in the lexicon...\n",
str.c_str()); str.c_str());
...@@ -251,7 +251,7 @@ do_spell(ZHfstOspeller& speller, const std::string& str) ...@@ -251,7 +251,7 @@ do_spell(ZHfstOspeller& speller, const std::string& str)
} }
if (all_no_spell) if (all_no_spell)
{ {
hfst_fprintf(stdout, hfst_fprintf(stdout,
"All spellings were invalidated by analysis! " "All spellings were invalidated by analysis! "
".:. Not in lexicon!\n"); ".:. Not in lexicon!\n");
} }
...@@ -281,43 +281,33 @@ zhfst_spell(char* zhfst_filename) ...@@ -281,43 +281,33 @@ zhfst_spell(char* zhfst_filename)
{ {
speller.read_zhfst(zhfst_filename); speller.read_zhfst(zhfst_filename);
} }
catch (hfst_ospell::ZHfstMetaDataParsingError zhmdpe) catch (hfst_ospell::ZHfstMetaDataParsingError& zhmdpe)
{ {
hfst_fprintf(stderr, "cannot finish reading zhfst archive %s:\n%s.\n", hfst_fprintf(stderr, "cannot finish reading zhfst archive %s:\n%s.\n",
zhfst_filename, zhmdpe.what()); zhfst_filename, zhmdpe.what());
//std::cerr << "cannot finish reading zhfst archive " << zhfst_filename <<
// ":\n" << zhmdpe.what() << "." << std::endl;
return EXIT_FAILURE; return EXIT_FAILURE;
} }
catch (hfst_ospell::ZHfstZipReadingError zhzre) catch (hfst_ospell::ZHfstZipReadingError& zhzre)
{ {
//std::cerr << "cannot read zhfst archive " << zhfst_filename << ":\n" hfst_fprintf(stderr,
// << zhzre.what() << "." << std::endl
// << "trying to read as legacy automata directory" << std::endl;
hfst_fprintf(stderr,
"cannot read zhfst archive %s:\n" "cannot read zhfst archive %s:\n"
"%s.\n", "%s.\n",
zhfst_filename, zhzre.what()); zhfst_filename, zhzre.what());
return EXIT_FAILURE; return EXIT_FAILURE;
} }
catch (hfst_ospell::ZHfstXmlParsingError zhxpe) catch (hfst_ospell::ZHfstXmlParsingError& zhxpe)
{ {
//std::cerr << "Cannot finish reading index.xml from " hfst_fprintf(stderr,
// << zhfst_filename << ":" << std::endl
// << zhxpe.what() << "." << std::endl;
hfst_fprintf(stderr,
"Cannot finish reading index.xml from %s:\n" "Cannot finish reading index.xml from %s:\n"
"%s.\n", "%s.\n",
zhfst_filename, zhxpe.what()); zhfst_filename, zhxpe.what());
return EXIT_FAILURE; return EXIT_FAILURE;
} }
if (verbose) if (verbose)
{ {
//std::cout << "Following metadata was read from ZHFST archive:" << std::endl hfst_fprintf(stdout,
// << speller.metadata_dump() << std::endl;
hfst_fprintf(stdout,
"Following metadata was read from ZHFST archive:\n" "Following metadata was read from ZHFST archive:\n"
"%s\n", "%s\n",
speller.metadata_dump().c_str()); speller.metadata_dump().c_str());
} }
speller.set_queue_limit(suggs); speller.set_queue_limit(suggs);
...@@ -354,7 +344,7 @@ zhfst_spell(char* zhfst_filename) ...@@ -354,7 +344,7 @@ zhfst_spell(char* zhfst_filename)
std::string linestr = wide_string_to_string(wstr); std::string linestr = wide_string_to_string(wstr);
free(str); free(str);
str = strdup(linestr.c_str()); str = strdup(linestr.c_str());
#else #else
while (!std::cin.eof()) { while (!std::cin.eof()) {
std::cin.getline(str, 2000); std::cin.getline(str, 2000);
#endif #endif
...@@ -398,7 +388,7 @@ int ...@@ -398,7 +388,7 @@ int
hfst_fprintf(stdout, "Not printing suggestions worse than best by margin %f\n", suggs); hfst_fprintf(stdout, "Not printing suggestions worse than best by margin %f\n", suggs);
} }
char * str = (char*) malloc(2000); char * str = (char*) malloc(2000);
#ifdef WINDOWS #ifdef WINDOWS
SetConsoleCP(65001); SetConsoleCP(65001);
const HANDLE stdIn = GetStdHandle(STD_INPUT_HANDLE); const HANDLE stdIn = GetStdHandle(STD_INPUT_HANDLE);
...@@ -410,7 +400,7 @@ int ...@@ -410,7 +400,7 @@ int
std::string linestr = wide_string_to_string(wstr); std::string linestr = wide_string_to_string(wstr);
free(str); free(str);
str = strdup(linestr.c_str()); str = strdup(linestr.c_str());
#else #else
while (!std::cin.eof()) { while (!std::cin.eof()) {
std::cin.getline(str, 2000); std::cin.getline(str, 2000);
#endif #endif
...@@ -435,11 +425,11 @@ int ...@@ -435,11 +425,11 @@ int
int main(int argc, char **argv) int main(int argc, char **argv)
{ {
#if HAVE_GETOPT_H
int c; int c;
//std::locale::global(std::locale("")); //std::locale::global(std::locale(""));
#if HAVE_GETOPT_H
while (true) { while (true) {
static struct option long_options[] = static struct option long_options[] =
{ {
...@@ -463,7 +453,7 @@ int main(int argc, char **argv) ...@@ -463,7 +453,7 @@ int main(int argc, char **argv)
#endif #endif
{0, 0, 0, 0 } {0, 0, 0, 0 }
}; };
int option_index = 0; int option_index = 0;
c = getopt_long(argc, argv, "hVvqsan:w:b:t:SXm:l:k", long_options, &option_index); c = getopt_long(argc, argv, "hVvqsan:w:b:t:SXm:l:k", long_options, &option_index);
char* endptr = 0; char* endptr = 0;
...@@ -476,17 +466,17 @@ int main(int argc, char **argv) ...@@ -476,17 +466,17 @@ int main(int argc, char **argv)
print_usage(); print_usage();
return EXIT_SUCCESS; return EXIT_SUCCESS;
break; break;
case 'V': case 'V':
print_version(); print_version();
return EXIT_SUCCESS; return EXIT_SUCCESS;
break; break;
case 'v': case 'v':
verbose = true; verbose = true;
quiet = false; quiet = false;
break; break;
case 'q': // fallthrough case 'q': // fallthrough
case 's': case 's':
quiet = true; quiet = true;
...@@ -550,7 +540,7 @@ int main(int argc, char **argv) ...@@ -550,7 +540,7 @@ int main(int argc, char **argv)
case 'k': case 'k':
output_to_console = true; output_to_console = true;
break; break;
#endif #endif
case 'S': case 'S':
suggest = true; suggest = true;
break; break;
......
...@@ -21,16 +21,16 @@ ...@@ -21,16 +21,16 @@
*/ */
/* /*
Tests up to 16 variations of each input token: Tests up to 8 variations of each input token:
- Verbatim - Verbatim
- With leading non-alphanumerics removed - With leading non-alphanumerics removed
- With trailing non-alphanumerics removed - With trailing non-alphanumerics removed
- With leading and trailing non-alphanumerics removed - With leading and trailing non-alphanumerics removed
- Lower-case of all the above - First-lower of all the above
- First-upper of all the above
*/ */
#include <iostream> #include <iostream>
#include <iomanip>
#include <fstream> #include <fstream>
#include <vector> #include <vector>
#include <string> #include <string>
...@@ -42,6 +42,7 @@ ...@@ -42,6 +42,7 @@
#include <cmath> #include <cmath>
#include <cerrno> #include <cerrno>
#include <cctype> #include <cctype>
#include <getopt.h>
#define U_CHARSET_IS_UTF8 1 #define U_CHARSET_IS_UTF8 1
#include <unicode/uclean.h> #include <unicode/uclean.h>
...@@ -64,13 +65,18 @@ struct word_t { ...@@ -64,13 +65,18 @@ struct word_t {
UnicodeString buffer; UnicodeString buffer;
}; };
std::vector<word_t> words(16); std::vector<word_t> words(16);
std::string buffer; std::string buffer, wbuf;
std::vector<std::string> alts; using Alt = std::pair<double,std::string>;
std::vector<Alt> alts;
std::unordered_set<std::string> outputs; std::unordered_set<std::string> outputs;
UnicodeString ubuffer, uc_buffer; UnicodeString ubuffer, uc_buffer;
size_t cw; size_t cw;
bool verbatim = false; bool verbatim = false;
bool debug = false;
hfst_ospell::Weight max_weight = -1.0;
hfst_ospell::Weight beam = -1.0;
float time_cutoff = 6.0;
bool uc_first = false; bool uc_first = false;
bool uc_all = true; bool uc_all = true;
...@@ -82,14 +88,18 @@ bool find_alternatives(ZHfstOspeller& speller, size_t suggs) { ...@@ -82,14 +88,18 @@ bool find_alternatives(ZHfstOspeller& speller, size_t suggs) {
for (size_t k=0 ; k < cw && alts.size()<suggs ; ++k) { for (size_t k=0 ; k < cw && alts.size()<suggs ; ++k) {
buffer.clear(); buffer.clear();
words[k].buffer.toUTF8String(buffer); words[k].buffer.toUTF8String(buffer);
hfst_ospell::CorrectionQueue corrections = speller.suggest(buffer); auto corrections = speller.suggest(buffer);
if (corrections.size() == 0) { if (corrections.size() == 0) {
continue; continue;
} }
// Because speller.set_queue_limit() doesn't actually work, hard limit it here for (size_t i=0, e=corrections.size() ; i<e ; ++i) {
for (size_t i=0, e=corrections.size() ; i<e && alts.size()<suggs ; ++i) { // Work around https://github.com/hfst/hfst-ospell/issues/54
if (max_weight > 0.0 && corrections.top().second > max_weight) {
break;
}
auto w = corrections.top().second * (1.0 + k/10.0);
buffer.clear(); buffer.clear();
if (k != 0) { if (k != 0) {
...@@ -112,8 +122,18 @@ bool find_alternatives(ZHfstOspeller& speller, size_t suggs) { ...@@ -112,8 +122,18 @@ bool find_alternatives(ZHfstOspeller& speller, size_t suggs) {
words[0].buffer.tempSubString(words[k].start + words[k].count).toUTF8String(buffer); words[0].buffer.tempSubString(words[k].start + words[k].count).toUTF8String(buffer);
} }
if (debug) {
wbuf.resize(64);
wbuf.resize(sprintf(&wbuf[0], " (%.2f;%zu)", corrections.top().second, k));
buffer += wbuf;
}
if (outputs.count(buffer) == 0) { if (outputs.count(buffer) == 0) {
alts.push_back(buffer); alts.push_back({w, buffer});
std::sort(alts.begin(), alts.end());
while (alts.size() > suggs) {
alts.pop_back();
}
} }
outputs.insert(buffer); outputs.insert(buffer);
corrections.pop(); corrections.pop();
...@@ -123,7 +143,7 @@ bool find_alternatives(ZHfstOspeller& speller, size_t suggs) { ...@@ -123,7 +143,7 @@ bool find_alternatives(ZHfstOspeller& speller, size_t suggs) {
if (!alts.empty()) { if (!alts.empty()) {
std::cout << "&"; std::cout << "&";
for (auto& alt : alts) { for (auto& alt : alts) {
std::cout << "\t" << alt; std::cout << "\t" << alt.second;
} }
std::cout << std::endl; std::cout << std::endl;
return true; return true;
...@@ -167,7 +187,7 @@ bool is_valid_word(ZHfstOspeller& speller, const std::string& word, size_t suggs ...@@ -167,7 +187,7 @@ bool is_valid_word(ZHfstOspeller& speller, const std::string& word, size_t suggs
} }
size_t ichStart = 0, cchUse = ubuffer.length(); size_t ichStart = 0, cchUse = ubuffer.length();
const UChar *pwsz = ubuffer.getTerminatedBuffer(); auto pwsz = ubuffer.getTerminatedBuffer();
// Always test the full given input // Always test the full given input
words[0].buffer.remove(); words[0].buffer.remove();
...@@ -216,7 +236,7 @@ bool is_valid_word(ZHfstOspeller& speller, const std::string& word, size_t suggs ...@@ -216,7 +236,7 @@ bool is_valid_word(ZHfstOspeller& speller, const std::string& word, size_t suggs
for (size_t i=0, e=cw ; i<e ; ++i) { for (size_t i=0, e=cw ; i<e ; ++i) {
// If we are looking for suggestions, don't use the cache // If we are looking for suggestions, don't use the cache
valid_words_t::iterator it = suggs ? valid_words.end() : valid_words.find(words[i].buffer); auto it = suggs ? valid_words.end() : valid_words.find(words[i].buffer);
if (it == valid_words.end()) { if (it == valid_words.end()) {
buffer.clear(); buffer.clear();
...@@ -224,49 +244,21 @@ bool is_valid_word(ZHfstOspeller& speller, const std::string& word, size_t suggs ...@@ -224,49 +244,21 @@ bool is_valid_word(ZHfstOspeller& speller, const std::string& word, size_t suggs
bool valid = speller.spell(buffer); bool valid = speller.spell(buffer);
it = valid_words.insert(std::make_pair(words[i].buffer,valid)).first; it = valid_words.insert(std::make_pair(words[i].buffer,valid)).first;
if (!valid && !verbatim) { if (!valid && !verbatim && uc_first) {
// If the word was not valid, fold it to lower case and try again // If the word was not valid, try a first-lower variant
buffer.clear();
ubuffer = words[i].buffer;
ubuffer.toLower();
ubuffer.toUTF8String(buffer);
// Add the lower case variant to the list so that we get suggestions using that, if need be
words[cw].start = words[i].start;
words[cw].count = words[i].count;
words[cw].buffer = ubuffer;
++cw;
// Don't try again if the lower cased variant has already been tried
valid_words_t::iterator itl = suggs ? valid_words.end() : valid_words.find(ubuffer);
if (itl != valid_words.end()) {
it->second = itl->second;
it = itl;
}
else {
valid = speller.spell(buffer);
it->second = valid; // Also mark the original mixed case variant as whatever the lower cased one was
it = valid_words.insert(std::make_pair(words[i].buffer,valid)).first;
}
}
if (!valid && !verbatim && (uc_all || uc_first)) {
// If the word was still not valid but had upper case, try a first-upper variant
buffer.clear(); buffer.clear();
ubuffer.setTo(words[i].buffer, 0, 1); ubuffer.setTo(words[i].buffer, 0, 1);
ubuffer.toUpper(); ubuffer.toLower();
uc_buffer.setTo(words[i].buffer, 1); ubuffer.append(words[i].buffer, 1, words[i].buffer.length() - 1);
uc_buffer.toLower();
ubuffer.append(uc_buffer);
ubuffer.toUTF8String(buffer); ubuffer.toUTF8String(buffer);
// Add the first-upper variant to the list so that we get suggestions using that, if need be // Add the first-lower case variant to the list so that we get suggestions using that, if need be
words[cw].start = words[i].start; words[cw].start = words[i].start;
words[cw].count = words[i].count; words[cw].count = words[i].count;
words[cw].buffer = ubuffer; words[cw].buffer = ubuffer;
++cw; ++cw;
// Don't try again if the first-upper variant has already been tried // Don't try again if the first-lower variant has already been tried
valid_words_t::iterator itl = suggs ? valid_words.end() : valid_words.find(ubuffer); valid_words_t::iterator itl = suggs ? valid_words.end() : valid_words.find(ubuffer);
if (itl != valid_words.end()) { if (itl != valid_words.end()) {
it->second = itl->second; it->second = itl->second;
...@@ -274,7 +266,7 @@ bool is_valid_word(ZHfstOspeller& speller, const std::string& word, size_t suggs ...@@ -274,7 +266,7 @@ bool is_valid_word(ZHfstOspeller& speller, const std::string& word, size_t suggs
} }
else { else {
valid = speller.spell(buffer); valid = speller.spell(buffer);
it->second = valid; // Also mark the original mixed case variant as whatever the first-upper one was it->second = valid; // Also mark the original mixed case variant as whatever the first-lower one was
it = valid_words.insert(std::make_pair(words[i].buffer,valid)).first; it = valid_words.insert(std::make_pair(words[i].buffer,valid)).first;
} }
} }
...@@ -291,8 +283,13 @@ bool is_valid_word(ZHfstOspeller& speller, const std::string& word, size_t suggs ...@@ -291,8 +283,13 @@ bool is_valid_word(ZHfstOspeller& speller, const std::string& word, size_t suggs
int zhfst_spell(const char* zhfst_filename) { int zhfst_spell(const char* zhfst_filename) {
ZHfstOspeller speller; ZHfstOspeller speller;
try { try {
if (debug) {
std::cout << "@@ Loading " << zhfst_filename << " with args max-weight=" << max_weight << ", beam=" << beam << ", time-cutoff=" << time_cutoff << std::endl;
}
speller.read_zhfst(zhfst_filename); speller.read_zhfst(zhfst_filename);
speller.set_time_cutoff(6.0); speller.set_weight_limit(max_weight);
speller.set_beam(beam);
speller.set_time_cutoff(time_cutoff);
} }
catch (hfst_ospell::ZHfstMetaDataParsingError zhmdpe) { catch (hfst_ospell::ZHfstMetaDataParsingError zhmdpe) {
fprintf(stderr, "cannot finish reading zhfst archive %s:\n%s.\n", zhfst_filename, zhmdpe.what()); fprintf(stderr, "cannot finish reading zhfst archive %s:\n%s.\n", zhfst_filename, zhmdpe.what());
...@@ -319,6 +316,38 @@ int zhfst_spell(const char* zhfst_filename) { ...@@ -319,6 +316,38 @@ int zhfst_spell(const char* zhfst_filename) {
if (line.empty()) { if (line.empty()) {
continue; continue;
} }
if (line.size() >= 5 && line[0] == '$' && line[1] == '$' && line[3] == ' ') {
if (line[2] == 'd' && isdigit(line[4]) && line.size() == 5) {
debug = (line[4] != '0');
std::cout << "@@ Option debug changed to " << debug << std::endl;
continue;
}
if (line[2] == 'T' && isdigit(line[4]) && line.size() == 5) {
verbatim = (line[4] != '0');
std::cout << "@@ Option verbatim changed to " << verbatim << std::endl;
continue;
}
if (line[2] == 'w' && isdigit(line[4])) {
max_weight = std::stof(&line[4]);
speller.set_weight_limit(max_weight);
std::cout << "@@ Option max-weight changed to " << max_weight << std::endl;
continue;
}
if (line[2] == 'b' && isdigit(line[4])) {
beam = std::stof(&line[4]);
speller.set_beam(beam);
std::cout << "@@ Option beam changed to " << beam << std::endl;
continue;
}
if (line[2] == 't' && isdigit(line[4])) {
time_cutoff = std::stof(&line[4]);
speller.set_time_cutoff(time_cutoff);
std::cout << "@@ Option time-cutoff changed to " << time_cutoff << std::endl;
continue;
}
}
// Just in case anyone decides to use the speller for a minor eternity // Just in case anyone decides to use the speller for a minor eternity
if (valid_words.size() > 20480) { if (valid_words.size() > 20480) {
valid_words.clear(); valid_words.clear();
...@@ -345,6 +374,19 @@ int zhfst_spell(const char* zhfst_filename) { ...@@ -345,6 +374,19 @@ int zhfst_spell(const char* zhfst_filename) {
return EXIT_SUCCESS; return EXIT_SUCCESS;
} }
// Writes the command-line usage summary to standard output: the usage line,
// a blank line, then one line per supported option. The stream is flushed
// before returning so the text is visible even if the caller exits at once.
void print_help() {
    // Adjacent string literals are concatenated by the compiler, so the whole
    // help text is emitted with a single insertion.
    static const char usage_text[] =
        "Usage: hfst-ospell [options] zhfst-archive\n"
        "\n"
        " -h, --help Shows this help\n"
        " -d, --debug Debug output with weights attached to results\n"
        " -T, --verbatim Disables case-folding and non-alphanumeric trimming\n"
        " -w, --max-weight=W Suppress corrections with weights above W\n"
        " -b, --beam=W Suppress corrections worse than best candidate by more than W\n"
        " -t, --time-cutoff=T Stop trying to find better corrections after T seconds; defaults to 6.0\n";

    std::cout << usage_text;
    std::cout.flush();
}
int main(int argc, char **argv) { int main(int argc, char **argv) {
UErrorCode status = U_ZERO_ERROR; UErrorCode status = U_ZERO_ERROR;
u_init(&status); u_init(&status);
...@@ -356,22 +398,60 @@ int main(int argc, char **argv) { ...@@ -356,22 +398,60 @@ int main(int argc, char **argv) {
ucnv_setDefaultName("UTF-8"); ucnv_setDefaultName("UTF-8");
uloc_setDefault("en_US_POSIX", &status); uloc_setDefault("en_US_POSIX", &status);
std::vector<std::string> args(argv, argv+argc); struct option long_options[] =
for (std::vector<std::string>::iterator it=args.begin() ; it != args.end() ; ) { {
if (*it == "--verbatim") { {"help", no_argument, 0, 'h'},
verbatim = true; {"debug", no_argument, 0, 'd'},
it = args.erase(it); {"verbatim", no_argument, 0, 'T'},
{"max-weight", required_argument, 0, 'w'},
{"beam", required_argument, 0, 'b'},
{"time-cutoff", required_argument, 0, 't'},
{0, 0, 0, 0 }
};
int c = 0;
while (true) {
int option_index = 0;
c = getopt_long(argc, argv, "hdTw:b:t:", long_options, &option_index);
if (c == -1) {
break;
} }
else {
++it; switch (c) {
case 'h':
print_help();
return EXIT_SUCCESS;
case 'd':
debug = true;
break;
case 'T':
verbatim = true;
break;
case 'w':
max_weight = std::stof(optarg);
break;
case 'b':
beam = std::stof(optarg);
break;
case 't':
time_cutoff = std::stof(optarg);
break;
} }
} }
if (args.size() < 2) { if (optind >= argc) {
throw std::invalid_argument("Must pass a zhfst as argument"); throw std::invalid_argument("Must pass a zhfst as argument");
} }
int rv = zhfst_spell(args[1].c_str()); std::cerr << std::fixed << std::setprecision(2);
std::cout << std::fixed << std::setprecision(2);
int rv = zhfst_spell(argv[optind]);
u_cleanup(); u_cleanup();
return rv; return rv;
......
...@@ -4,6 +4,7 @@ ...@@ -4,6 +4,7 @@
#include "hfstol-stdafx.h" #include "hfstol-stdafx.h"
#include <string> #include <string>
#include <sstream> #include <sstream>
#include <cstring>
namespace hfst_ospell namespace hfst_ospell
{ {
...@@ -21,7 +22,7 @@ struct OspellException ...@@ -21,7 +22,7 @@ struct OspellException
size_t line; //!< line number of exception size_t line; //!< line number of exception
OspellException(void) {} OspellException(void) {}
//! //!
//! construct exception with name, file and location //! construct exception with name, file and location
OspellException(const std::string &name,const std::string &file,size_t line): OspellException(const std::string &name,const std::string &file,size_t line):
...@@ -29,7 +30,7 @@ struct OspellException ...@@ -29,7 +30,7 @@ struct OspellException
file(file), file(file),
line(line) line(line)
{} {}
//! //!
//! create string representation of exception for output //! create string representation of exception for output
std::string operator() (void) const std::string operator() (void) const
...@@ -45,7 +46,7 @@ struct OspellException ...@@ -45,7 +46,7 @@ struct OspellException
{ {
std::ostringstream o; std::ostringstream o;
o << file << ":" << line << ":" << name; o << file << ":" << line << ":" << name;
return o.str().c_str(); return strdup(o.str().c_str());
} }
}; };
...@@ -59,7 +60,7 @@ struct OspellException ...@@ -59,7 +60,7 @@ struct OspellException
#define HFSTOSPELL_EXCEPTION_CHILD_DECLARATION(CHILD) \ #define HFSTOSPELL_EXCEPTION_CHILD_DECLARATION(CHILD) \
struct CHILD : public OspellException \ struct CHILD : public OspellException \
{ CHILD(const std::string &name,const std::string &file,size_t line):\ { CHILD(const std::string &name,const std::string &file,size_t line):\
OspellException(name,file,line) {}} OspellException(name,file,line) {}}
#define HFST_CATCH(E) \ #define HFST_CATCH(E) \
catch (const E &e) \ catch (const E &e) \
......
...@@ -152,31 +152,31 @@ TreeNode TreeNode::update(SymbolNumber symbol, ...@@ -152,31 +152,31 @@ TreeNode TreeNode::update(SymbolNumber symbol,
bool TreeNode::try_compatible_with(FlagDiacriticOperation op) bool TreeNode::try_compatible_with(FlagDiacriticOperation op)
{ {
switch (op.Operation()) { switch (op.Operation()) {
case P: // positive set case P: // positive set
flag_state[op.Feature()] = op.Value(); flag_state[op.Feature()] = op.Value();
return true; return true;
case N: // negative set (literally, in this implementation) case N: // negative set (literally, in this implementation)
flag_state[op.Feature()] = -1*op.Value(); flag_state[op.Feature()] = -1*op.Value();
return true; return true;
case R: // require case R: // require
if (op.Value() == 0) { // "plain" require, return false if unset if (op.Value() == 0) { // "plain" require, return false if unset
return (flag_state[op.Feature()] != 0); return (flag_state[op.Feature()] != 0);
} }
return (flag_state[op.Feature()] == op.Value()); return (flag_state[op.Feature()] == op.Value());
case D: // disallow case D: // disallow
if (op.Value() == 0) { // "plain" disallow, return true if unset if (op.Value() == 0) { // "plain" disallow, return true if unset
return (flag_state[op.Feature()] == 0); return (flag_state[op.Feature()] == 0);
} }
return (flag_state[op.Feature()] != op.Value()); return (flag_state[op.Feature()] != op.Value());
case C: // clear case C: // clear
flag_state[op.Feature()] = 0; flag_state[op.Feature()] = 0;
return true; return true;
case U: // unification case U: // unification
/* if the feature is unset OR the feature is to this value already OR /* if the feature is unset OR the feature is to this value already OR
the feature is negatively set to something else than this value */ the feature is negatively set to something else than this value */
...@@ -190,7 +190,7 @@ bool TreeNode::try_compatible_with(FlagDiacriticOperation op) ...@@ -190,7 +190,7 @@ bool TreeNode::try_compatible_with(FlagDiacriticOperation op)
} }
return false; return false;
} }
return false; // to make the compiler happy return false; // to make the compiler happy
} }
...@@ -204,7 +204,11 @@ Speller::Speller(Transducer* mutator_ptr, Transducer* lexicon_ptr): ...@@ -204,7 +204,11 @@ Speller::Speller(Transducer* mutator_ptr, Transducer* lexicon_ptr):
alphabet_translator(SymbolVector()), alphabet_translator(SymbolVector()),
operations(lexicon->get_operations()), operations(lexicon->get_operations()),
limiting(None), limiting(None),
mode(Correct) mode(Correct),
max_time(-1.0),
start_clock(0),
call_counter(0),
limit_reached(false)
{ {
if (mutator != NULL) { if (mutator != NULL) {
build_alphabet_translator(); build_alphabet_translator();
...@@ -228,7 +232,7 @@ void Speller::lexicon_epsilons(void) ...@@ -228,7 +232,7 @@ void Speller::lexicon_epsilons(void)
} }
TransitionTableIndex next = lexicon->next(next_node.lexicon_state, 0); TransitionTableIndex next = lexicon->next(next_node.lexicon_state, 0);
STransition i_s = lexicon->take_epsilons_and_flags(next); STransition i_s = lexicon->take_epsilons_and_flags(next);
while (i_s.symbol != NO_SYMBOL) { while (i_s.symbol != NO_SYMBOL) {
if (is_under_weight_limit(next_node.weight + i_s.weight)) { if (is_under_weight_limit(next_node.weight + i_s.weight)) {
if (lexicon->transitions.input_symbol(next) == 0) { if (lexicon->transitions.input_symbol(next) == 0) {
...@@ -326,7 +330,7 @@ void Speller::mutator_epsilons(void) ...@@ -326,7 +330,7 @@ void Speller::mutator_epsilons(void)
} }
TransitionTableIndex next_m = mutator->next(next_node.mutator_state, 0); TransitionTableIndex next_m = mutator->next(next_node.mutator_state, 0);
STransition mutator_i_s = mutator->take_epsilons(next_m); STransition mutator_i_s = mutator->take_epsilons(next_m);
while (mutator_i_s.symbol != NO_SYMBOL) { while (mutator_i_s.symbol != NO_SYMBOL) {
if (mutator_i_s.symbol == 0) { if (mutator_i_s.symbol == 0) {
if (is_under_weight_limit( if (is_under_weight_limit(
...@@ -460,12 +464,9 @@ bool Transducer::initialize_input_vector(SymbolVector & input_vector, ...@@ -460,12 +464,9 @@ bool Transducer::initialize_input_vector(SymbolVector & input_vector,
char * line) char * line)
{ {
input_vector.clear(); input_vector.clear();
SymbolNumber k = NO_SYMBOL;
char ** inpointer = &line; char ** inpointer = &line;
char * oldpointer;
while (**inpointer != '\0') { while (**inpointer != '\0') {
oldpointer = *inpointer; SymbolNumber k = encoder->find_key(inpointer);
k = encoder->find_key(inpointer);
if (k == NO_SYMBOL) { // no tokenization from alphabet if (k == NO_SYMBOL) { // no tokenization from alphabet
// for real handling of other and identity for unseen symbols, // for real handling of other and identity for unseen symbols,
// use the Speller interface analyse()! // use the Speller interface analyse()!
...@@ -532,18 +533,18 @@ AnalysisQueue Transducer::lookup(char * line) ...@@ -532,18 +533,18 @@ AnalysisQueue Transducer::lookup(char * line)
i_s = take_epsilons_and_flags(next_index); i_s = take_epsilons_and_flags(next_index);
} }
} }
// input consumption loop // input consumption loop
unsigned int input_state = next_node.input_state; unsigned int input_state = next_node.input_state;
if (input_state < input.size() && if (input_state < input.size() &&
has_transitions( has_transitions(
next_node.lexicon_state + 1, input[input_state])) { next_node.lexicon_state + 1, input[input_state])) {
next_index = next(next_node.lexicon_state, next_index = next(next_node.lexicon_state,
input[input_state]); input[input_state]);
STransition i_s = take_non_epsilons(next_index, STransition i_s = take_non_epsilons(next_index,
input[input_state]); input[input_state]);
while (i_s.symbol != NO_SYMBOL) { while (i_s.symbol != NO_SYMBOL) {
queue.push_back(next_node.update( queue.push_back(next_node.update(
i_s.symbol, i_s.symbol,
...@@ -551,18 +552,18 @@ AnalysisQueue Transducer::lookup(char * line) ...@@ -551,18 +552,18 @@ AnalysisQueue Transducer::lookup(char * line)
next_node.mutator_state, next_node.mutator_state,
i_s.index, i_s.index,
i_s.weight)); i_s.weight));
++next_index; ++next_index;
i_s = take_non_epsilons(next_index, input[input_state]); i_s = take_non_epsilons(next_index, input[input_state]);
} }
} }
} }
for (auto& it : outputs) { for (auto& it : outputs) {
analyses.push(StringWeightPair(it.first, it.second)); analyses.push(StringWeightPair(it.first, it.second));
} }
return analyses; return analyses;
} }
...@@ -729,7 +730,7 @@ Weight Transducer::final_weight(const TransitionTableIndex i) const ...@@ -729,7 +730,7 @@ Weight Transducer::final_weight(const TransitionTableIndex i) const
bool bool
Transducer::is_flag(const SymbolNumber symbol) Transducer::is_flag(const SymbolNumber symbol)
{ {
return alphabet.is_flag(symbol); return alphabet.is_flag(symbol);
} }
bool bool
...@@ -888,7 +889,7 @@ CorrectionQueue Speller::correct(char * line, int nbest, ...@@ -888,7 +889,7 @@ CorrectionQueue Speller::correct(char * line, int nbest,
std::map<std::string, Weight> corrections; std::map<std::string, Weight> corrections;
SymbolNumber first_input = (input.size() == 0) ? 0 : input[0]; SymbolNumber first_input = (input.size() == 0) ? 0 : input[0];
if (cache[first_input].empty) { if (cache[first_input].empty) {
build_cache(first_input); build_cache(first_input); // XXX: cache corrupts limit!
} }
if (input.size() <= 1) { if (input.size() <= 1) {
// get the cached results and we're done // get the cached results and we're done
...@@ -908,6 +909,7 @@ CorrectionQueue Speller::correct(char * line, int nbest, ...@@ -908,6 +909,7 @@ CorrectionQueue Speller::correct(char * line, int nbest,
} }
} }
} }
set_limiting_behaviour(nbest, maxweight, beam);
adjust_weight_limits(nbest, beam); adjust_weight_limits(nbest, beam);
for(auto& it : *results) { for(auto& it : *results) {
// Then collect the results // Then collect the results
...@@ -946,6 +948,7 @@ CorrectionQueue Speller::correct(char * line, int nbest, ...@@ -946,6 +948,7 @@ CorrectionQueue Speller::correct(char * line, int nbest,
*/ */
next_node = queue.back(); next_node = queue.back();
queue.pop_back(); queue.pop_back();
set_limiting_behaviour(nbest, maxweight, beam); // XXX: need to reset
adjust_weight_limits(nbest, beam); adjust_weight_limits(nbest, beam);
// if we can't get an acceptable result, never mind // if we can't get an acceptable result, never mind
if (next_node.weight > limit) { if (next_node.weight > limit) {
...@@ -1005,6 +1008,7 @@ CorrectionQueue Speller::correct(char * line, int nbest, ...@@ -1005,6 +1008,7 @@ CorrectionQueue Speller::correct(char * line, int nbest,
} }
} }
} }
//cache[first_input].clear();
return correction_queue; return correction_queue;
} }
...@@ -1031,12 +1035,16 @@ void Speller::set_limiting_behaviour(int nbest, Weight maxweight, Weight beam) ...@@ -1031,12 +1035,16 @@ void Speller::set_limiting_behaviour(int nbest, Weight maxweight, Weight beam)
limiting = Nbest; limiting = Nbest;
} else if (maxweight < 0.0 && nbest == 0 && beam >= 0.0) { } else if (maxweight < 0.0 && nbest == 0 && beam >= 0.0) {
limiting = Beam; limiting = Beam;
} else {
return;
} }
} }
void Speller::adjust_weight_limits(int nbest, Weight beam) void Speller::adjust_weight_limits(int nbest, Weight beam)
{ {
if (limiting == Nbest && nbest_queue.size() >= nbest) { if (limiting == MaxWeight) {
return;
} else if (limiting == Nbest && nbest_queue.size() >= nbest) {
limit = nbest_queue.get_highest(); limit = nbest_queue.get_highest();
} else if (limiting == MaxWeightNbest && nbest_queue.size() >= nbest) { } else if (limiting == MaxWeightNbest && nbest_queue.size() >= nbest) {
limit = std::min(limit, nbest_queue.get_lowest()); limit = std::min(limit, nbest_queue.get_lowest());
...@@ -1201,7 +1209,7 @@ void Speller::add_symbol_to_alphabet_translator(SymbolNumber to_sym) ...@@ -1201,7 +1209,7 @@ void Speller::add_symbol_to_alphabet_translator(SymbolNumber to_sym)
} }
} // namespace hfst_ospell } // namespace hfst_ospell
char* char*
hfst_strndup(const char* s, size_t n) hfst_strndup(const char* s, size_t n)
{ {
......
0% Loading. Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment