fix spell checking issues using recent Hunspell patches

Test: English words "Ian" and "item" are no longer accepted as "İan" and "İtem".

Patch list with commit ids in Hunspell repository:

commit 66badb7449c2053c89456f11a7f71f3f5916b550
Extend dotless i and dotted I rules to Crimean Tatar language

commit 88cf975c295e3ec808efb77bb1a2a031d77f0c89
Allow dotted I in dictionary, and disable bad capitalization

commit 39b785a6b03b35cc8a27f43f6005dcaa432694e1
FORBIDDENWORD precedes BREAK

commit 0f691abe68788d0a58e72ab66877a9f670cd2741
Remove forbidden words from dash suggestion list

commit 15b2cde4f01706f0a648518a5cfc57394d015448
tdf#95024 fix compound handling for new Hungarian orthography

commit de3ae6844af62300e473f7b7b66a56e54153b4b9
fix compound word part "pa:"

Change-Id: Id12b5629b0c975464072b5b144743cbe40fe45a3
Reviewed-on: https://gerrit.libreoffice.org/44200
Tested-by: Jenkins <ci@libreoffice.org>
Reviewed-by: Andras Timar <andras.timar@collabora.com>
diff --git a/external/hunspell/0001-Allow-dotted-I-in-dictionary-and-disable-bad-capital.patch b/external/hunspell/0001-Allow-dotted-I-in-dictionary-and-disable-bad-capital.patch
new file mode 100644
index 0000000..b4b0438
--- /dev/null
+++ b/external/hunspell/0001-Allow-dotted-I-in-dictionary-and-disable-bad-capital.patch
@@ -0,0 +1,55 @@
From 88cf975c295e3ec808efb77bb1a2a031d77f0c89 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C3=A1szl=C3=B3=20N=C3=A9meth?=
 <laszlo.nemeth@collabora.com>
Date: Thu, 5 Oct 2017 12:24:02 +0200
Subject: [PATCH] Allow dotted I in dictionary, and disable bad capitalization
 of i.

Dictionary words weren't recognized with dotted I, but dictionary
words with the letter i were recognized with dotted I, too.
---
 src/hunspell/hunspell.cxx | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/src/hunspell/hunspell.cxx b/src/hunspell/hunspell.cxx
index 1ef11df..5c98f8a 100644
--- a/src/hunspell/hunspell.cxx
+++ b/src/hunspell/hunspell.cxx
@@ -562,11 +562,15 @@ bool HunspellImpl::spell(const std::string& word, int* info, std::string* root)
       }
     }
     case INITCAP: {
-
+      // handle special capitalization of dotted I
+      bool Idot = (utf8 && (unsigned char) scw[0] == 0xc4 && (unsigned char) scw[1] == 0xb0);
       *info += SPELL_ORIGCAP;
-      mkallsmall2(scw, sunicw);
-      std::string u8buffer(scw);
-      mkinitcap2(scw, sunicw);
+      if (captype == ALLCAP) {
+          mkallsmall2(scw, sunicw);
+          mkinitcap2(scw, sunicw);
+          if (Idot)
+             scw.replace(0, 1, "\xc4\xb0");
+      }
       if (captype == INITCAP)
         *info += SPELL_INITCAP;
       rv = checkword(scw, info, root);
@@ -581,9 +585,13 @@ bool HunspellImpl::spell(const std::string& word, int* info, std::string* root)
       }
       if (rv && is_keepcase(rv) && (captype == ALLCAP))
         rv = NULL;
-      if (rv)
+      if (rv || (Idot && langnum != LANG_az && langnum != LANG_tr && langnum != LANG_crh))
         break;
 
+      mkallsmall2(scw, sunicw);
+      std::string u8buffer(scw);
+      mkinitcap2(scw, sunicw);
+
       rv = checkword(u8buffer, info, root);
       if (abbv && !rv) {
         u8buffer.push_back('.');
-- 
1.9.1

diff --git a/external/hunspell/0001-Extend-dotless-i-and-dotted-I-rules-to-Crimean-Tatar.patch b/external/hunspell/0001-Extend-dotless-i-and-dotted-I-rules-to-Crimean-Tatar.patch
new file mode 100644
index 0000000..66cc781
--- /dev/null
+++ b/external/hunspell/0001-Extend-dotless-i-and-dotted-I-rules-to-Crimean-Tatar.patch
@@ -0,0 +1,66 @@
From 66badb7449c2053c89456f11a7f71f3f5916b550 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C3=A1szl=C3=B3=20N=C3=A9meth?=
 <laszlo.nemeth@collabora.com>
Date: Thu, 5 Oct 2017 11:13:28 +0200
Subject: [PATCH] Extend dotless i and dotted I rules to Crimean Tatar language
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

to support its special casing of ı/I, i/İ.

(Use

LANG crh

in the affix file to use this feature.)
---
 src/hunspell/csutil.cxx  | 5 +++--
 src/hunspell/langnum.hxx | 1 +
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/hunspell/csutil.cxx b/src/hunspell/csutil.cxx
index df97b57..2980da7 100644
--- a/src/hunspell/csutil.cxx
+++ b/src/hunspell/csutil.cxx
@@ -2401,6 +2401,7 @@ static struct lang_map lang2enc[] =
     {{"ar", LANG_ar},    {"az", LANG_az},
      {"az_AZ", LANG_az},  // for back-compatibility
      {"bg", LANG_bg},    {"ca", LANG_ca},
+     {"crh", LANG_crh},
      {"cs", LANG_cs},    {"da", LANG_da},
      {"de", LANG_de},    {"el", LANG_el},
      {"en", LANG_en},    {"es", LANG_es},
@@ -2458,7 +2459,7 @@ unsigned short unicodetoupper(unsigned short c, int langnum) {
   // In Azeri and Turkish, I and i dictinct letters:
   // There are a dotless lower case i pair of upper `I',
   // and an upper I with dot pair of lower `i'.
-  if (c == 0x0069 && ((langnum == LANG_az) || (langnum == LANG_tr)))
+  if (c == 0x0069 && ((langnum == LANG_az) || (langnum == LANG_tr) || (langnum == LANG_crh)))
     return 0x0130;
 #ifdef OPENOFFICEORG
   return static_cast<unsigned short>(u_toupper(c));
@@ -2475,7 +2476,7 @@ unsigned short unicodetolower(unsigned short c, int langnum) {
   // In Azeri and Turkish, I and i dictinct letters:
   // There are a dotless lower case i pair of upper `I',
   // and an upper I with dot pair of lower `i'.
-  if (c == 0x0049 && ((langnum == LANG_az) || (langnum == LANG_tr)))
+  if (c == 0x0049 && ((langnum == LANG_az) || (langnum == LANG_tr) || (langnum == LANG_crh)))
     return 0x0131;
 #ifdef OPENOFFICEORG
   return static_cast<unsigned short>(u_tolower(c));
diff --git a/src/hunspell/langnum.hxx b/src/hunspell/langnum.hxx
index a64d3d7..f09de40 100644
--- a/src/hunspell/langnum.hxx
+++ b/src/hunspell/langnum.hxx
@@ -48,6 +48,7 @@ enum {
   LANG_az = 100,  // custom number
   LANG_bg = 41,
   LANG_ca = 37,
+  LANG_crh = 102, // custom number
   LANG_cs = 42,
   LANG_da = 45,
   LANG_de = 49,
-- 
1.9.1

diff --git a/external/hunspell/0001-FORBIDDENWORD-precedes-BREAK.patch b/external/hunspell/0001-FORBIDDENWORD-precedes-BREAK.patch
new file mode 100644
index 0000000..6cad45d
--- /dev/null
+++ b/external/hunspell/0001-FORBIDDENWORD-precedes-BREAK.patch
@@ -0,0 +1,27 @@
From 39b785a6b03b35cc8a27f43f6005dcaa432694e1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C3=A1szl=C3=B3=20N=C3=A9meth?= <nemeth@numbertext.org>
Date: Mon, 9 Oct 2017 13:02:39 +0200
Subject: [PATCH] FORBIDDENWORD precedes BREAK

Now it's possible to forbid compound forms recognized by
BREAK word breaking.
---
 src/hunspell/hunspell.cxx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/hunspell/hunspell.cxx b/src/hunspell/hunspell.cxx
index 5c98f8a..3fd0d16 100644
--- a/src/hunspell/hunspell.cxx
+++ b/src/hunspell/hunspell.cxx
@@ -633,7 +633,7 @@ bool HunspellImpl::spell(const std::string& word, int* info, std::string* root)
   }
 
   // recursive breaking at break points
-  if (!wordbreak.empty()) {
+  if (!wordbreak.empty() && !(*info & SPELL_FORBIDDEN)) {
 
     int nbr = 0;
     wl = scw.size();
-- 
1.9.1

diff --git a/external/hunspell/0001-Remove-forbidden-words-from-dash-suggestion-list.patch b/external/hunspell/0001-Remove-forbidden-words-from-dash-suggestion-list.patch
new file mode 100644
index 0000000..b0f8563
--- /dev/null
+++ b/external/hunspell/0001-Remove-forbidden-words-from-dash-suggestion-list.patch
@@ -0,0 +1,29 @@
From 0f691abe68788d0a58e72ab66877a9f670cd2741 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C3=A1szl=C3=B3=20N=C3=A9meth?= <nemeth@numbertext.org>
Date: Tue, 10 Oct 2017 11:58:43 +0200
Subject: [PATCH] Remove forbidden words from dash suggestion list

---
 src/hunspell/hunspell.cxx | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/hunspell/hunspell.cxx b/src/hunspell/hunspell.cxx
index 3fd0d16..76e61b1 100644
--- a/src/hunspell/hunspell.cxx
+++ b/src/hunspell/hunspell.cxx
@@ -1069,7 +1069,11 @@ std::vector<std::string> HunspellImpl::suggest(const std::string& word) {
             wspace.append("-");
             wspace.append(scw.substr(dash_pos + 1));
           }
-          insert_sug(slst, wspace);
+          int info = 0;
+          if (pAMgr && pAMgr->get_forbiddenword())
+            checkword(wspace, &info, NULL);
+          if (!(info & SPELL_FORBIDDEN))
+            insert_sug(slst, wspace);
         }
         nodashsug = 0;
       }
-- 
1.9.1

diff --git a/external/hunspell/0001-fix-compound-handling-for-new-Hungarian-orthography.patch b/external/hunspell/0001-fix-compound-handling-for-new-Hungarian-orthography.patch
new file mode 100644
index 0000000..0bf52bd
--- /dev/null
+++ b/external/hunspell/0001-fix-compound-handling-for-new-Hungarian-orthography.patch
@@ -0,0 +1,43 @@
From 15b2cde4f01706f0a648518a5cfc57394d015448 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C3=A1szl=C3=B3=20N=C3=A9meth?= <nemeth@numbertext.org>
Date: Thu, 12 Oct 2017 16:47:57 +0200
Subject: [PATCH] fix compound handling for new Hungarian orthography

Extend partial fix in commit 42807f970ac2d65f0d13a7c57eb454b210e92240.
---
 src/hunspell/affixmgr.cxx | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/hunspell/affixmgr.cxx b/src/hunspell/affixmgr.cxx
index ffce7bb..ea0f0fc 100644
--- a/src/hunspell/affixmgr.cxx
+++ b/src/hunspell/affixmgr.cxx
@@ -1990,6 +1990,8 @@ struct hentry* AffixMgr::compound_check(const std::string& word,
                 std::string tmp(sfxappnd);
                 reverseword(tmp);
                 numsyllable -= get_syllable(tmp) + sfxextra;
+              } else {
+                numsyllable -= sfxextra;
               }
 
               // + 1 word, if syllable number of the prefix > 1 (hungarian
@@ -2024,7 +2026,6 @@ struct hentry* AffixMgr::compound_check(const std::string& word,
                 (TESTAFF(rv->astr, compoundroot, rv->alen))) {
               wordnum++;
             }
-
             // second word is acceptable, as a word with prefix or/and suffix?
             // hungarian conventions: compounding is acceptable,
             // when compound forms consist 2 word, otherwise
@@ -2553,6 +2554,8 @@ int AffixMgr::compound_check_morph(const char* word,
             std::string tmp(sfxappnd);
             reverseword(tmp);
             numsyllable -= get_syllable(tmp) + sfxextra;
+          } else {
+            numsyllable -= sfxextra;
           }
 
           // + 1 word, if syllable number of the prefix > 1 (hungarian
-- 
1.9.1

diff --git a/external/hunspell/0001-fix-compound-word-part-pa.patch b/external/hunspell/0001-fix-compound-word-part-pa.patch
new file mode 100644
index 0000000..152a9ff
--- /dev/null
+++ b/external/hunspell/0001-fix-compound-word-part-pa.patch
@@ -0,0 +1,26 @@
From de3ae6844af62300e473f7b7b66a56e54153b4b9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C3=A1szl=C3=B3=20N=C3=A9meth?= <nemeth@numbertext.org>
Date: Mon, 16 Oct 2017 23:00:23 +0200
Subject: [PATCH] fix compound word part "pa:"

(regression in morphological analysis)
---
 src/hunspell/affixmgr.cxx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/hunspell/affixmgr.cxx b/src/hunspell/affixmgr.cxx
index ea0f0fc..52c7fa5 100644
--- a/src/hunspell/affixmgr.cxx
+++ b/src/hunspell/affixmgr.cxx
@@ -2608,7 +2608,7 @@ int AffixMgr::compound_check_morph(const char* word,
           if (!m.empty()) {
             result.push_back(MSEP_FLD);
             result.append(MORPH_PART);
-            result.append(word + 1);
+            result.append(word + i);
             line_uniq_app(m, MSEP_REC);
             result.append(m);
           }
-- 
1.9.1

diff --git a/external/hunspell/UnpackedTarball_hunspell.mk b/external/hunspell/UnpackedTarball_hunspell.mk
index 3bb7e5e..23d3aca 100644
--- a/external/hunspell/UnpackedTarball_hunspell.mk
+++ b/external/hunspell/UnpackedTarball_hunspell.mk
@@ -21,6 +21,12 @@ $(eval $(call gb_UnpackedTarball_set_patchlevel,hunspell,1))

$(eval $(call gb_UnpackedTarball_add_patches,hunspell, \
	external/hunspell/0001-Revert-Remove-autotools-autogenerated-files.patch \
	external/hunspell/0001-Extend-dotless-i-and-dotted-I-rules-to-Crimean-Tatar.patch \
	external/hunspell/0001-Allow-dotted-I-in-dictionary-and-disable-bad-capital.patch \
	external/hunspell/0001-FORBIDDENWORD-precedes-BREAK.patch \
	external/hunspell/0001-Remove-forbidden-words-from-dash-suggestion-list.patch \
	external/hunspell/0001-fix-compound-handling-for-new-Hungarian-orthography.patch \
	external/hunspell/0001-fix-compound-word-part-pa.patch \
))

# vim: set noet sw=4 ts=4: