Merge pull request #1469 from hypothesis/729-unicode-normalization

Normalize for diacritics

Merge pull request #1469 from hypothesis/729-unicode-normalization
Normalize for diacritics
027f2778 · Nick Stenning · f5a37538 · 97f829b2 · 027f2778 · 027f2778
Commit 027f2778 authored Nov 18, 2014 by Nick Stenning
6 changed files
--- a/h/static/scripts/helpers/string-helpers.coffee
+++ b/h/static/scripts/helpers/string-helpers.coffee
+# Shared helper methods for working with strings/unicode strings.
+# Unicode normalization is delegated to the unorm library, which this
+# module references through the free variable `unorm`.
+createStringHelpers = ->
+  # Character class of Unicode combining marks, taken from
+  # http://xregexp.com/addons/unicode/unicode-categories.js line:30
+  # The `g` flag makes `replace` strip every occurrence, not just the first.
+  allMarks = /[\u0300-\u036F\u0483-\u0489\u0591-\u05BD\u05BF\u05C1\u05C2\u05C4\u05C5\u05C7\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06DC\u06DF-\u06E4\u06E7\u06E8\u06EA-\u06ED\u0711\u0730-\u074A\u07A6-\u07B0\u07EB-\u07F3\u0816-\u0819\u081B-\u0823\u0825-\u0827\u0829-\u082D\u0859-\u085B\u08E4-\u08FE\u0900-\u0903\u093A-\u093C\u093E-\u094F\u0951-\u0957\u0962\u0963\u0981-\u0983\u09BC\u09BE-\u09C4\u09C7\u09C8\u09CB-\u09CD\u09D7\u09E2\u09E3\u0A01-\u0A03\u0A3C\u0A3E-\u0A42\u0A47\u0A48\u0A4B-\u0A4D\u0A51\u0A70\u0A71\u0A75\u0A81-\u0A83\u0ABC\u0ABE-\u0AC5\u0AC7-\u0AC9\u0ACB-\u0ACD\u0AE2\u0AE3\u0B01-\u0B03\u0B3C\u0B3E-\u0B44\u0B47\u0B48\u0B4B-\u0B4D\u0B56\u0B57\u0B62\u0B63\u0B82\u0BBE-\u0BC2\u0BC6-\u0BC8\u0BCA-\u0BCD\u0BD7\u0C01-\u0C03\u0C3E-\u0C44\u0C46-\u0C48\u0C4A-\u0C4D\u0C55\u0C56\u0C62\u0C63\u0C82\u0C83\u0CBC\u0CBE-\u0CC4\u0CC6-\u0CC8\u0CCA-\u0CCD\u0CD5\u0CD6\u0CE2\u0CE3\u0D02\u0D03\u0D3E-\u0D44\u0D46-\u0D48\u0D4A-\u0D4D\u0D57\u0D62\u0D63\u0D82\u0D83\u0DCA\u0DCF-\u0DD4\u0DD6\u0DD8-\u0DDF\u0DF2\u0DF3\u0E31\u0E34-\u0E3A\u0E47-\u0E4E\u0EB1\u0EB4-\u0EB9\u0EBB\u0EBC\u0EC8-\u0ECD\u0F18\u0F19\u0F35\u0F37\u0F39\u0F3E\u0F3F\u0F71-\u0F84\u0F86\u0F87\u0F8D-\u0F97\u0F99-\u0FBC\u0FC6\u102B-\u103E\u1056-\u1059\u105E-\u1060\u1062-\u1064\u1067-\u106D\u1071-\u1074\u1082-\u108D\u108F\u109A-\u109D\u135D-\u135F\u1712-\u1714\u1732-\u1734\u1752\u1753\u1772\u1773\u17B4-\u17D3\u17DD\u180B-\u180D\u18A9\u1920-\u192B\u1930-\u193B\u19B0-\u19C0\u19C8\u19C9\u1A17-\u1A1B\u1A55-\u1A5E\u1A60-\u1A7C\u1A7F\u1B00-\u1B04\u1B34-\u1B44\u1B6B-\u1B73\u1B80-\u1B82\u1BA1-\u1BAD\u1BE6-\u1BF3\u1C24-\u1C37\u1CD0-\u1CD2\u1CD4-\u1CE8\u1CED\u1CF2-\u1CF4\u1DC0-\u1DE6\u1DFC-\u1DFF\u20D0-\u20F0\u2CEF-\u2CF1\u2D7F\u2DE0-\u2DFF\u302A-\u302F\u3099\u309A\uA66F-\uA672\uA674-\uA67D\uA69F\uA6F0\uA6F1\uA802\uA806\uA80B\uA823-\uA827\uA880\uA881\uA8B4-\uA8C4\uA8E0-\uA8F1\uA926-\uA92D\uA947-\uA953\uA980-\uA983\uA9B3-\uA9C0\uAA29-\uAA36\uAA43\uAA4C\uAA4D\uAA7B\uAAB0\uAAB2-\uAAB4\uAAB7\uAAB8\uAABE\uAABF\uAAC1\uAAEB-\uAAEF\uAAF5\uAAF6\uABE3-\uABEA\uABEC\uABED\uFB1E\uFE00-\uFE0F\uFE20-\uFE26]/g;
+
+  # Fold a string for diacritic-insensitive comparison.
+  #
+  # str - the string to fold.
+  #
+  # Returns the NFKD (compatibility-decomposed) form of `str` with every
+  # character matched by `allMarks` removed, e.g. 'é' becomes 'e'.
+  uniFold: (str) ->
+    # Decompose first so precomposed characters split into base + mark(s).
+    str  = unorm.nfkd(str)
+    # Drop the marks, keeping only the base characters.
+    str.replace allMarks, ''
+
+angular.module('h.helpers')
+.service('stringHelpers', createStringHelpers)
--- a/h/static/scripts/searchfilters.coffee
+++ b/h/static/scripts/searchfilters.coffee
@@ -2,6 +2,7 @@
 # It expects a search query string where the search term are separated by space character
 # and collects them into the given term arrays
 class SearchFilter
+
  # This function will slice the search-text input
  # Slice character: space,
  # but an expression between quotes (' or ") is considered one
@@ -73,7 +74,7 @@ class SearchFilter
        filter = term.slice 0, term.indexOf ":"
        unless filter? then filter = ""
        switch filter
-          when 'quote' then quote.push term[6..].toLowerCase()
+          when 'quote' then quote.push term[6..]
          when 'result' then result.push term[7..]
          when 'since'
            # We'll turn this into seconds
@@ -109,11 +110,11 @@ class SearchFilter
              # Time given in year
              t = /^(\d+)year$/.exec(time)[1]
              since.push t * 60 * 60 * 24 * 365
-          when 'tag' then tag.push term[4..].toLowerCase()
-          when 'text' then text.push term[5..].toLowerCase()
-          when 'uri' then uri.push term[4..].toLowerCase()
-          when 'user' then user.push term[5..].toLowerCase()
-          else any.push term.toLowerCase()
+          when 'tag' then tag.push term[4..]
+          when 'text' then text.push term[5..]
+          when 'uri' then uri.push term[4..]
+          when 'user' then user.push term[5..]
+          else any.push term

    any:
      terms: any
@@ -140,7 +141,6 @@ class SearchFilter
      terms: user
      operator: 'or'

-
 # This class will process the results of search and generate the correct filter
 # It expects the following dict format as rules
 # { facet_name : {
@@ -152,7 +152,7 @@ class SearchFilter
 #
 #      options: backend specific options
 #      options.es: elasticsearch specific options
-#      options.es.query_type : can be: simple, query_string, match, multi_match
+#      options.es.query_type : can be: simple (term), query_string, match, multi_match
 #         defaults to: simple, determines which es query type to use
 #      options.es.cutoff_frequency: if set, the query will be given a cutoff_frequency for this facet
 #      options.es.and_or: match and multi_match queries can use this, defaults to and
@@ -164,25 +164,20 @@ class QueryParser
  rules:
    user:
      path: '/user'
-      case_sensitive: false
      and_or: 'or'
    text:
      path: '/text'
-      case_sensitive: false
      and_or: 'and'
    tag:
      path: '/tags'
-      case_sensitive: false
      and_or: 'and'
    quote:
      path: '/quote'
-      case_sensitive: false
      and_or: 'and'
    uri:
      formatter: (uri) ->
        uri.toLowerCase()
      path: '/uri'
-      case_sensitive: false
      and_or: 'or'
      options:
        es:
@@ -203,11 +198,9 @@ class QueryParser
            when '1 year' then 365*24*60*60
        new Date(new Date().valueOf() - seconds*1000)
      path: '/created'
-      case_sensitive: true
      and_or: 'and'
      operator: 'ge'
    any:
-      case_sensitive: false
      and_or: 'and'
      path:   ['/quote', '/tags', '/text', '/uri', '/user']
      options:
@@ -225,7 +218,6 @@ class QueryParser
      unless terms.length then continue
      rule = @rules[category] 

-
      # Now generate the clause with the help of the rule
      case_sensitive = if rule.case_sensitive? then rule.case_sensitive else false
      and_or = if rule.and_or? then rule.and_or else 'or'

--- a/h/static/scripts/services.coffee
+++ b/h/static/scripts/services.coffee
@@ -435,6 +435,15 @@ class ViewFilter
    any:
      fields: ['quote', 'text', 'tag', 'user']

+  # Dependency names, in the same order as the constructor parameters.
+  this.$inject = ['searchfilter','stringHelpers']
+  constructor: (searchfilter, stringHelpers) ->
+    @searchfilter = searchfilter
+
+    # Fold strings with stringHelpers.uniFold; any other value passes through as-is.
+    @_normalize = (value) ->
+      if typeof value is 'string' then stringHelpers.uniFold(value) else value
+
  _matches: (filter, value, match) ->
    matches = true

@@ -468,11 +477,12 @@ class ViewFilter

    value = checker.value annotation
    if angular.isArray value
-      if typeof(value[0]) == 'string'
-        value = value.map (v) -> v.toLowerCase()
+      value = value.map (e) -> e.toLowerCase()
+      value = value.map (e) => @_normalize(e)
      return @_arrayMatches filter, value, checker.match
    else
-      value = value.toLowerCase() if typeof(value) == 'string'
+      value = value.toLowerCase()
+      value = @_normalize(value)
      return @_matches filter, value, checker.match

  # Filters a set of annotations, according to a given query.
@@ -497,7 +507,15 @@ class ViewFilter
    limit = Math.min((filters.result?.terms or [])...)
    count = 0

-    results = for annotation in annotations
+    # Normalizing the filters, need to do only once.
+    for _, filter of filters
+      if filter.terms
+        filter.terms = filter.terms.map (e) =>
+          e = e.toLowerCase()
+          e = @_normalize e
+          e
+
+    for annotation in annotations
      break if count >= limit

      match = true

--- a/h/static/scripts/vendor/unorm.js
+++ b/h/static/scripts/vendor/unorm.js
--- a/karma.config.js
+++ b/karma.config.js
@@ -41,6 +41,7 @@ module.exports = function(config) {
      'h/static/scripts/vendor/moment-timezone-data.js',
      'h/static/scripts/vendor/Markdown.Converter.js',
      'h/static/scripts/vendor/sockjs-0.3.4.js',
+      'h/static/scripts/vendor/unorm.js',
      'h/static/scripts/vendor/uuid.js',
      'h/static/scripts/app.js',
      'h/static/scripts/auth.js',

--- a/tests/js/helpers/string-helpers-test.coffee
+++ b/tests/js/helpers/string-helpers-test.coffee
+# Use chai's assert interface throughout this spec.
+assert = chai.assert
+# Expose sinon's assertions on the same `assert` object, with an empty name prefix.
+sinon.assert.expose(assert, prefix: '')
+
+describe 'h.helpers.stringHelpers', ->
+  stringHelpers = null
+
+  beforeEach module('h.helpers')
+
+  beforeEach inject (_stringHelpers_) ->
+    stringHelpers = _stringHelpers_
+
+  describe '.uniFold', ->
+    it 'removes hungarian marks', ->
+      text = 'Fürge rőt róka túlszökik zsíros étkű kutyán'
+      decoded = stringHelpers.uniFold text
+      expected = 'Furge rot roka tulszokik zsiros etku kutyan'
+
+      assert.equal decoded, expected
+
+    it 'removes greek marks', ->
+      text = 'Καλημέρα κόσμε'
+      decoded = stringHelpers.uniFold text
+      expected = 'Καλημερα κοσμε'
+
+      assert.equal decoded, expected
+
+    it 'removes japanese marks', ->
+      # NFKD splits a voiced kana into its base kana plus the combining voiced
+      # sound mark (U+3099); stripping that mark leaves the unvoiced kana, so
+      # the expected text has ハ where the input has バ.
+      text = 'カタカナコンバータ'
+      decoded = stringHelpers.uniFold text
+      expected = 'カタカナコンハータ'
+
+      assert.equal decoded, expected
+
+    it 'removes marathi marks', ->
+      text = 'काचं शक्नोम्यत्तुम'
+      decoded = stringHelpers.uniFold text
+      expected = 'कच शकनमयततम'
+
+      assert.equal decoded, expected
+
+    it 'removes thai marks', ->
+      text = 'ฉันกินกระจกได้ แต่มันไม่ทำให้ฉันเจ็บ'
+      decoded = stringHelpers.uniFold text
+      expected = 'ฉนกนกระจกได แตมนไมทาใหฉนเจบ'
+
+      assert.equal decoded, expected
+
+    it 'removes all marks', ->
+      text = '̀ ́ ̂ ̃ ̄ ̅ ̆ ̇ ̈ ̉ ̊ ̋ ̌ ̍ ̎ ̏ ̐ ̑ ̒ ̓ ̔ ̕ ̖ ̗ ̘ ̙ ̚ ̛ ̜ ̝ ̞ ̟ ̠ ̡ ̢ ̣ ̤ ̥ ̦ ̧ ̨ ̩ ̪ ̫ ̬ ̭ ̮ ̯ ̰ ̱ ̲ ̳ ̴ ̵ ̶ ̷ ̸ ̹ ̺ ̻ ̼ ̽ ̾ ̿ ̀ ́ ͂ ̓ ̈́ ͅ ͠ ͡"'
+      decoded = stringHelpers.uniFold text
+      expected = '                                                                       "'
+
+      assert.equal decoded, expected