Commit cd5586af authored by csillag's avatar csillag

Fuzzy anchoring finally fully functional.

- Added libraries required for fuzzy text matching
- Updated dom-text-* and annotator
parent dcab98ab
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
** Dual licensed under the MIT and GPLv3 licenses. ** Dual licensed under the MIT and GPLv3 licenses.
** https://github.com/okfn/annotator/blob/master/LICENSE ** https://github.com/okfn/annotator/blob/master/LICENSE
** **
** Built at: 2013-03-03 22:52:56Z ** Built at: 2013-03-03 23:59:25Z
*/ */
(function() { (function() {
...@@ -883,9 +883,7 @@ ...@@ -883,9 +883,7 @@
endOffset = (this.domMapper.getInfoForNode(nRange.end)).end; endOffset = (this.domMapper.getInfoForNode(nRange.end)).end;
currentQuote = this.domMapper.getContentForRange(startOffset, endOffset); currentQuote = this.domMapper.getContentForRange(startOffset, endOffset);
if (currentQuote !== savedQuote) { if (currentQuote !== savedQuote) {
console.log("Could not apply XPath selector to current document, because the quote has changed."); console.log("Could not apply XPath selector to current document, because the quote has changed. (Saved quote is '" + savedQuote + "', current quote is '" + currentQuote + "'.)");
console.log("Saved quote is '" + savedQuote + "'.");
console.log("Current quote is '" + currentQuote + "'.");
return null; return null;
} else { } else {
...@@ -913,9 +911,7 @@ ...@@ -913,9 +911,7 @@
savedQuote = this.getQuoteForTarget(target); savedQuote = this.getQuoteForTarget(target);
currentQuote = this.domMapper.getContentForRange(selector.start, selector.end); currentQuote = this.domMapper.getContentForRange(selector.start, selector.end);
if (currentQuote !== savedQuote) { if (currentQuote !== savedQuote) {
console.log("Could not apply position selector to current document, because the quote has changed."); console.log("Could not apply position selector to current document, because the quote has changed. (Saved quote is '" + savedQuote + "', current quote is '" + currentQuote + "'.)");
console.log("Saved quote is '" + savedQuote + "'.");
console.log("Current quote is '" + currentQuote + "'.");
return null; return null;
} else { } else {
...@@ -928,10 +924,29 @@ ...@@ -928,10 +924,29 @@
return browserRange.normalize(); return browserRange.normalize();
}; };
/**
 * Try to anchor a target by fuzzy-searching for its saved quote.
 *
 * Reads the "context+quote" selector of the target for the text to look for,
 * and the "position" selector (when present) for the offset around which to
 * search. When no position was saved, the search is centred on the middle of
 * the document.
 *
 * @param {Object} target  Annotation target; its `selector` list is searched.
 * @returns The normalized range of the single match, or null when there is no
 *          saved quote or the search did not yield exactly one match.
 */
Annotator.prototype.findAnchorWithFuzzyMatching = function(target) {
  var browserRange, match, posSelector, quote, quoteSelector, result, start;
  quoteSelector = this.findSelector(target.selector, "context+quote");
  quote = quoteSelector != null ? quoteSelector.exact : void 0;
  if (quote == null) return null;
  posSelector = this.findSelector(target.selector, "position");
  start = posSelector != null ? posSelector.start : void 0;
  // Only fall back when no position was saved at all: a saved offset of 0 is
  // a legitimate location and must not be replaced by the middle of the
  // document. The fallback is floored so that it is always a whole offset.
  if (start == null) start = Math.floor(this.domMapper.getDocLength() / 2);
  result = this.domMatcher.searchFuzzy(quote, start);
  // Anything other than exactly one hit is treated as "could not anchor".
  if (result.matches.length !== 1) return null;
  match = result.matches[0];
  if (!match.exact) {
    console.log("Using fuzzy matching, found '" + match.found + "', instead of '" + quote + "'.");
  }
  browserRange = new Range.BrowserRange(match.range);
  return browserRange.normalize();
};
Annotator.prototype.findAnchor = function(target) { Annotator.prototype.findAnchor = function(target) {
var anchor; var anchor;
anchor = this.findAnchorFromXPathRangeSelector(target); anchor = this.findAnchorFromXPathRangeSelector(target);
anchor || (anchor = this.findAnchorFromPositionSelector(target)); anchor || (anchor = this.findAnchorFromPositionSelector(target));
anchor || (anchor = this.findAnchorWithFuzzyMatching(target));
return anchor; return anchor;
}; };
...@@ -960,16 +975,11 @@ ...@@ -960,16 +975,11 @@
console.log(exception); console.log(exception);
} }
} }
annotation.currentQuote = [];
annotation.currentRanges = [];
annotation.highlights = []; annotation.highlights = [];
for (_l = 0, _len4 = normedRanges.length; _l < _len4; _l++) { for (_l = 0, _len4 = normedRanges.length; _l < _len4; _l++) {
normed = normedRanges[_l]; normed = normedRanges[_l];
annotation.currentQuote.push($.trim(normed.text()));
annotation.currentRanges.push(normed.serialize(this.wrapper[0], '.annotator-hl'));
$.merge(annotation.highlights, this.highlightRange(normed)); $.merge(annotation.highlights, this.highlightRange(normed));
} }
annotation.currentQuote = annotation.currentQuote.join(' / ');
$(annotation.highlights).data('annotation', annotation); $(annotation.highlights).data('annotation', annotation);
return annotation; return annotation;
}; };
......
This diff is collapsed.
...@@ -207,6 +207,8 @@ class window.DomTextMapper ...@@ -207,6 +207,8 @@ class window.DomTextMapper
path ?= @getDefaultPath() path ?= @getDefaultPath()
@path[path].length @path[path].length
# Convenience wrapper: calls getLengthForPath() with no path argument and
# returns its result.
# NOTE(review): presumably this yields the length of the text mapped for the
# default path — confirm against getLengthForPath.
getDocLength: -> @getLengthForPath()
# Return a given range of the rendered value of a part of the dom. # Return a given range of the rendered value of a part of the dom.
# If path is not given, the default path is used. # If path is not given, the default path is used.
getContentForRange: (start, end, path = null) -> getContentForRange: (start, end, path = null) ->
......
...@@ -111,7 +111,10 @@ class window.DomTextMatcher ...@@ -111,7 +111,10 @@ class window.DomTextMatcher
# #
# For the details about the returned data structure, see the documentation of the search() method. # For the details about the returned data structure, see the documentation of the search() method.
searchFuzzy: (pattern, pos, caseSensitive = false, matchDistance = 1000, matchThreshold = 0.5, path = null) -> searchFuzzy: (pattern, pos, caseSensitive = false, matchDistance = 1000, matchThreshold = 0.5, path = null) ->
if not @dmp? then @dmp = new window.DTM_DMPMatcher unless @dmp?
unless window.DTM_DMPMatcher?
throw new Error "DTM_DMPMatcher is not available. Have you loaded the text match engines?"
@dmp = new window.DTM_DMPMatcher
@dmp.setMatchDistance matchDistance @dmp.setMatchDistance matchDistance
@dmp.setMatchThreshold matchThreshold @dmp.setMatchThreshold matchThreshold
@dmp.setCaseSensitive caseSensitive @dmp.setCaseSensitive caseSensitive
...@@ -146,7 +149,7 @@ class window.DomTextMatcher ...@@ -146,7 +149,7 @@ class window.DomTextMatcher
unless pattern? then throw new Error "Can't search an for empty pattern!" unless pattern? then throw new Error "Can't search an for empty pattern!"
# Do some preparation, if required # Do some preparation, if required
t0 = @timestamp()# t0 = @timestamp()
if path? then @scan() if path? then @scan()
t1 = @timestamp() t1 = @timestamp()
......
# Naive text matcher: reports every literal occurrence of a pattern in a text.
class window.DTM_ExactMatcher
  constructor: ->
    # When true, reported matches never overlap: the search resumes right
    # after the end of each match. When false, it resumes one character after
    # the start of each match, so overlapping occurrences are reported too.
    @distinct = true
    @caseSensitive = false

  setDistinct: (value) -> @distinct = value

  setCaseSensitive: (value) -> @caseSensitive = value

  # Find all occurrences of `pattern` in `text`.
  #
  # Returns a list of { start, end } objects (end is exclusive), in ascending
  # order of position. Unless case sensitivity is enabled, both strings are
  # lower-cased first and the offsets refer to the lower-cased text.
  #
  # An empty pattern yields an empty list.
  search: (text, pattern) ->
    pLen = pattern.length

    # An empty pattern "matches" at every offset without consuming anything,
    # which would keep the loop below running forever.
    return [] if pLen is 0

    unless @caseSensitive
      text = text.toLowerCase()
      pattern = pattern.toLowerCase()

    # How far past the start of a hit the next search begins.
    step = if @distinct then pLen else 1

    results = []
    from = 0
    # Search from a moving offset instead of re-slicing the text on every hit.
    while (i = text.indexOf pattern, from) > -1
      results.push { start: i, end: i + pLen }
      from = i + step
    results
# Regular-expression based text matcher.
class window.DTM_RegexMatcher
  constructor: ->
    @caseSensitive = false

  setCaseSensitive: (value) -> @caseSensitive = value

  # Find all matches of the regular expression `pattern` (given as a string)
  # in `text`.
  #
  # Returns a list of { start, end } objects (end is exclusive), in the order
  # the matches occur. The expression is compiled with the global flag, plus
  # the ignore-case flag unless case sensitivity is enabled.
  #
  # Throws whatever the RegExp constructor throws for an invalid pattern.
  search: (text, pattern) ->
    re = new RegExp pattern, if @caseSensitive then "g" else "gi"
    results = []
    while m = re.exec text
      results.push { start: m.index, end: m.index + m[0].length }
      # A zero-length match does not move lastIndex, so exec() would keep
      # returning the same match forever; step past it by hand.
      re.lastIndex++ if m[0].length is 0
    results
# diff-match-patch - based text matcher
class window.DTM_DMPMatcher
  constructor: ->
    @dmp = new diff_match_patch
    # NOTE(review): per the diff-match-patch documentation, a timeout of 0
    # lets diffs run until they are complete — confirm for the bundled build.
    @dmp.Diff_Timeout = 0
    @caseSensitive = false

  # Return the characters of `text` in reverse order.
  _reverse: (text) -> text.split("").reverse().join ""

  # Use this to get the max allowed pattern length.
  # Trying to use a longer pattern will give an error.
  getMaxPatternLength: -> @dmp.Match_MaxBits

  # The following example is a classic dilemma.
  # There are two potential matches, one is close to the expected location
  # but contains a one character error, the other is far from the expected
  # location but is exactly the pattern sought after:
  #
  # match_main("abc12345678901234567890abbc", "abc", 26)
  #
  # Which result is returned (0 or 24) is determined by the
  # MatchDistance property.
  #
  # An exact letter match which is 'distance' characters away
  # from the fuzzy location would score as a complete mismatch.
  # For example, a distance of '0' requires the match be at the exact
  # location specified, whereas a distance of '1000' would require
  # a perfect match to be within 800 characters of the expected location
  # to be found using a 0.8 threshold (see below).
  #
  # The larger MatchDistance is, the slower search may take to compute.
  #
  # This variable defaults to 1000.
  setMatchDistance: (distance) -> @dmp.Match_Distance = distance

  getMatchDistance: -> @dmp.Match_Distance

  # MatchThreshold determines the cut-off value for a valid match.
  #
  # If Match_Threshold is closer to 0, the requirements for accuracy
  # increase. If Match_Threshold is closer to 1 then it is more likely
  # that a match will be found. The larger Match_Threshold is, the slower
  # search may take to compute.
  #
  # This variable defaults to 0.5.
  setMatchThreshold: (threshold) -> @dmp.Match_Threshold = threshold

  getMatchThreshold: -> @dmp.Match_Threshold

  # Read back the flag stored by setCaseSensitive.
  getCaseSensitive: -> @caseSensitive

  setCaseSensitive: (value) -> @caseSensitive = value

  # Given a text to search, a pattern to search for and an
  # expected location in the text near which to find the pattern,
  # return the location which matches closest.
  #
  # The function will search for the best match based on both the number
  # of character errors between the pattern and the potential match,
  # as well as the distance between the expected location and the
  # potential match.
  #
  # Returns a list holding at most one match object of the form
  # { start, end, data: { levenshtein }, hiddenData: { diff } };
  # the list is empty when no match is found.
  #
  # Throws an Error when expectedStartLoc is not >= 0.
  search: (text, pattern, expectedStartLoc = 0) ->
    unless expectedStartLoc >= 0 then throw new Error "Can't search at negative indices!"

    unless @caseSensitive
      text = text.toLowerCase()
      pattern = pattern.toLowerCase()

    pLen = pattern.length
    maxLen = @getMaxPatternLength()

    # Patterns that fit the engine's limit are searched for in one go.
    if pLen <= maxLen
      return @searchForSlice text, pattern, expectedStartLoc

    # Longer patterns: locate the head and the tail of the pattern
    # separately, and take everything between them as the match.
    results = []

    startSlice = pattern.substr 0, maxLen
    startPos = @searchForSlice text, startSlice, expectedStartLoc
    return results unless startPos.length

    endSlice = pattern.substr pLen - maxLen, maxLen
    # The tail is expected as far after the head as it is in the pattern.
    endLoc = startPos[0].start + pLen - maxLen
    endPos = @searchForSlice text, endSlice, endLoc
    return results unless endPos.length

    startIndex = startPos[0].start
    endIndex = endPos[0].end
    matchLen = endIndex - startIndex

    # The unchecked middle section may have grown or shrunk; accept the
    # candidate only if its length stays within 50% of the pattern length.
    if pLen * 0.5 <= matchLen <= pLen * 1.5
      found = text.substr startIndex, matchLen
      diff = @dmp.diff_main pattern, found
      lev = @dmp.diff_levenshtein diff
      @dmp.diff_cleanupSemantic diff
      results.push
        start: startIndex
        end: endIndex
        data:
          levenshtein: lev
        hiddenData:
          diff: @dmp.diff_prettyHtml diff

    results

  # ============= Private part ==========================================
  # You don't need to call the functions below this point manually

  # Fuzzy-search for a slice that is short enough for the engine.
  #
  # The start of the match is found by searching the text forwards; the end
  # is found by searching the reversed text for the reversed slice.
  #
  # Returns a one-element list with the match, or an empty list when either
  # search fails.
  #
  # NOTE(review): this relies on match_main returning an object with an
  # `index` field; stock diff-match-patch returns a bare integer, so a
  # patched build is presumably bundled — confirm.
  searchForSlice: (text, slice, expectedStartLoc) ->
    r1 = @dmp.match_main text, slice, expectedStartLoc
    startIndex = r1.index
    return [] if startIndex is -1

    txet = @_reverse text
    nrettap = @_reverse slice
    expectedEndLoc = startIndex + slice.length
    # Position of the expected end, counted from the end of the text.
    expectedDneLoc = text.length - expectedEndLoc
    r2 = @dmp.match_main txet, nrettap, expectedDneLoc
    dneIndex = r2.index
    # Without this check, a failed reverse search (-1) would be turned into
    # an end offset one past the end of the text.
    return [] if dneIndex is -1

    endIndex = text.length - dneIndex
    found = text.substr startIndex, endIndex - startIndex
    diff = @dmp.diff_main slice, found
    lev = @dmp.diff_levenshtein diff
    @dmp.diff_cleanupSemantic diff
    [{
      start: startIndex
      end: endIndex
      data:
        levenshtein: lev
      hiddenData:
        diff: @dmp.diff_prettyHtml diff
    }]
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment