Commit 25ad1a70 authored by Randall Leeds's avatar Randall Leeds

Merge pull request #1473 from hypothesis/vendorfoldercleanup

Convert coffee files in scripts/vendor to js, modify scripts/vendor/.gitignore
parents c55570c6 caeb7b24
*.min.js
dom_text.js
page_text_mapper_core.js
This diff is collapsed.
This diff is collapsed.
# Text search library
#
# Front-end for the text match engines (DTM_ExactMatcher, DTM_RegexMatcher,
# DTM_DMPMatcher). It runs a search over the corpus and annotates every hit
# with the text that was actually found and with how well it fits the pattern.
class window.DomTextMatcher

  # Parameters:
  #  corpus: a function returning the text to search in. It is called
  #          every time the text is needed.
  constructor: (@corpus) ->

  # Search for text using exact string matching
  #
  # Parameters:
  #  pattern: what to search for
  #
  #  distinct: forbid overlapping matches? (defaults to true)
  #
  #  caseSensitive: should the search be case sensitive? (defaults to false)
  #
  # For the details about the returned data structure,
  # see the documentation of the _search() method.
  searchExact: (pattern, distinct = true, caseSensitive = false) ->
    if not @pm then @pm = new window.DTM_ExactMatcher
    @pm.setDistinct(distinct)
    @pm.setCaseSensitive(caseSensitive)
    @_search @pm, pattern

  # Search for text using regular expressions
  #
  # Parameters:
  #  pattern: what to search for
  #
  #  caseSensitive: should the search be case sensitive? (defaults to false)
  #
  # For the details about the returned data structure,
  # see the documentation of the _search() method.
  searchRegex: (pattern, caseSensitive = false) ->
    if not @rm then @rm = new window.DTM_RegexMatcher
    @rm.setCaseSensitive(caseSensitive)
    @_search @rm, pattern

  # Search for text using fuzzy text matching
  #
  # Parameters:
  #  pattern: what to search for
  #
  #  pos: where to start searching
  #
  #  caseSensitive: should the search be case sensitive? (defaults to false)
  #
  #  options.matchDistance (default 1000) and
  #  options.matchThreshold (default 0.5):
  #     fine-tuning parameters for the d-m-p library.
  #     See http://code.google.com/p/google-diff-match-patch/wiki/API for details.
  #
  # For the details about the returned data structure,
  # see the documentation of the _search() method.
  searchFuzzy: (pattern, pos, caseSensitive = false, options = {}) ->
    @ensureDMP()
    @dmp.setMatchDistance options.matchDistance ? 1000
    @dmp.setMatchThreshold options.matchThreshold ? 0.5
    @dmp.setCaseSensitive caseSensitive
    @_search @dmp, pattern, pos, options

  # Search for a piece of text that is identified by its surrounding context
  #
  # Parameters:
  #  prefix, suffix: the text expected right before / right after the
  #     wanted range. Both are required.
  #
  #  pattern: the text expected between prefix and suffix. May be null,
  #     in which case whatever lies between the two contexts is accepted.
  #
  #  expectedStart, expectedEnd: where the range is expected (optional)
  #
  #  caseSensitive: should the context search be case sensitive?
  #     (defaults to false)
  #
  #  options:
  #     contextMatchDistance (default: twice the corpus length),
  #     contextMatchThreshold (default 0.5): d-m-p tuning for the context search
  #     patternMatchThreshold (default 0.5): max accepted error level
  #     flexContext: try to relocate the match to an exact occurrence
  #        of the pattern nearby
  #
  # Returns an object with a "matches" list holding zero or one match.
  # A match has the "start" and "end" of the range plus the fields
  # produced by _analyzeMatch().
  #
  # Throws an Error if prefix or suffix is missing.
  searchFuzzyWithContext: (prefix, suffix, pattern, expectedStart = null, expectedEnd = null, caseSensitive = false, options = {}) ->
    @ensureDMP()

    # No context, no joy
    unless (prefix? and suffix?)
      throw new Error "Can not do a context-based fuzzy search with missing context!"

    # Get full document length
    len = @corpus().length

    # Get a starting position for the prefix search
    expectedPrefixStart = if expectedStart?
      Math.max 0, expectedStart - prefix.length
    else
      Math.floor(len / 2)

    # Do the fuzzy search for the prefix
    @dmp.setMatchDistance options.contextMatchDistance ? len * 2
    @dmp.setMatchThreshold options.contextMatchThreshold ? 0.5
    # Apply the requested case sensitivity, so that a setting left over
    # from an earlier searchFuzzy() call can not leak into this search.
    @dmp.setCaseSensitive caseSensitive
    prefixResult = @dmp.search @corpus(), prefix, expectedPrefixStart

    # If the prefix is not found, give up
    unless prefixResult.length then return matches: []

    # This is where the prefix was found
    prefixStart = prefixResult[0].start
    prefixEnd = prefixResult[0].end

    # Let's find out where we expect to find the suffix!
    # We need the pattern's length.
    patternLength = if pattern?
      # If we have a pattern, use its length
      pattern.length
    else if expectedStart? and expectedEnd?
      # We don't have a pattern, but at least
      # have valid expectedStart and expectedEnd values,
      # get a length from that.
      expectedEnd - expectedStart
    else
      # We have no idea about where the suffix could be.
      # Let's just pull a number out of ... thin air.
      64

    # Get the part of text that is after the prefix
    remainingText = @corpus().substr prefixEnd

    # Calculate expected position
    expectedSuffixStart = patternLength

    # Do the fuzzy search for the suffix
    suffixResult = @dmp.search remainingText, suffix, expectedSuffixStart

    # If the suffix is not found, give up
    unless suffixResult.length then return matches: []

    # This is where the suffix was found
    suffixStart = prefixEnd + suffixResult[0].start
    suffixEnd = prefixEnd + suffixResult[0].end

    # This is the range between the prefix and the suffix
    charRange =
      start: prefixEnd
      end: suffixStart

    # Get the configured threshold for the pattern matching
    matchThreshold = options.patternMatchThreshold ? 0.5

    # See how good a match we have
    analysis = if pattern?
      @_analyzeMatch pattern, charRange, true
    else
      # There is nothing to compare against (and _analyzeMatch can not
      # normalize a null pattern), so only report the text that was found.
      found: @_normalizeString @corpus()[charRange.start...charRange.end]

    # Should we try to find a better match by moving the
    # initial match around a little bit, even if this has
    # a negative impact on the similarity of the context?
    if pattern? and options.flexContext and not analysis.exact

      # Do we have an exact match for the quote around here?
      if not @pm then @pm = new window.DTM_ExactMatcher
      @pm.setDistinct false
      @pm.setCaseSensitive false

      flexMatches = @pm.search @corpus()[prefixStart..suffixEnd], pattern

      candidate = null
      bestError = 2

      for flexMatch in flexMatches

        # Calculate the range that matched the quote
        flexRange =
          start: prefixStart + flexMatch.start
          end: prefixStart + flexMatch.end

        # Check how the prefix would fare
        prefixRange = start: prefixStart, end: flexRange.start
        a1 = @_analyzeMatch prefix, prefixRange, true
        prefixError = if a1.exact then 0 else a1.comparison.errorLevel

        # Check how the suffix would fare
        suffixRange = start: flexRange.end, end: suffixEnd
        a2 = @_analyzeMatch suffix, suffixRange, true
        suffixError = if a2.exact then 0 else a2.comparison.errorLevel

        # Did we get at least one exact context match?
        if a1.exact or a2.exact

          # Yes, we did. Calculate the total error
          totalError = prefixError + suffixError

          # Is this better than our best bet?
          if totalError < bestError
            # This is our best candidate so far. Store it.
            candidate = flexRange
            bestError = totalError

      if candidate?
        console.log "flexContext adjustment: we found a better candidate!"
        charRange = candidate
        analysis = @_analyzeMatch pattern, charRange, true

    # Do we have to compare what we found to a pattern?
    if (not pattern?) or # "No pattern, nothing to compare. Assume it's OK."
        analysis.exact or # "Found text matches exactly to pattern"
        (analysis.comparison.errorLevel <= matchThreshold) # still acceptable

      # Collect the results
      match = {}
      for obj in [charRange, analysis]
        for k, v of obj
          match[k] = v
      return matches: [match]

    # The error level is too high; reject the match
    return matches: []

  # ===== Private methods (never call from outside the module) =======

  # Do some normalization to get a "canonical" form of a string.
  # Runs of two or more whitespace characters are collapsed into
  # a single space, and the result is trimmed.
  _normalizeString: (string) -> (string.replace /\s{2,}/g, " ").trim()

  # Search for text with a custom matcher object
  #
  # Parameters:
  #  matcher: the object to use for doing the plain-text part of the search
  #  pattern: what to search for
  #  pos: where do we expect to find it
  #  options: passed on to the matcher; options.withFuzzyComparison
  #           (default false) requests a fuzzy comparison for inexact matches
  #
  # Returns an object with two fields:
  #  matches: a list of matches. Each one carries the fields returned by
  #           the matcher, merged with the result of _analyzeMatch().
  #           If no match is found, the list is empty.
  #  time: timing data (phase1_textMatching, phase2_matchMapping, total),
  #        computed from timestamp() differences
  #
  # Throws an Error if the pattern is null, or empty after trimming.
  _search: (matcher, pattern, pos, options = {}) ->
    # Prepare and check the pattern
    unless pattern? then throw new Error "Can't search for null pattern!"
    pattern = pattern.trim()
    # trim() never returns null, so emptiness has to be tested explicitly
    unless pattern then throw new Error "Can't search for an empty pattern!"

    fuzzyComparison = options.withFuzzyComparison ? false

    t1 = @timestamp()

    # Do the text search
    textMatches = matcher.search @corpus(), pattern, pos, options
    t2 = @timestamp()

    matches = []
    for textMatch in textMatches
      # See how good a match we have
      analysis = @_analyzeMatch pattern, textMatch, fuzzyComparison

      # Collect the results
      match = {}
      for obj in [textMatch, analysis]
        for k, v of obj
          match[k] = v

      matches.push match

    t3 = @timestamp()

    result =
      matches: matches
      time:
        phase1_textMatching: t2 - t1
        phase2_matchMapping: t3 - t2
        total: t3 - t1
    result

  # Current time, in milliseconds
  timestamp: -> new Date().getTime()

  # Read a match returned by the matcher engine, and compare it with the pattern
  #
  # Parameters:
  #  pattern: the text that was searched for (must be a string)
  #  charRange: object with "start" and "end" (exclusive) corpus offsets
  #  useFuzzy: also run a d-m-p comparison if the match is not exact?
  #
  # Returns an object with
  #  found: the normalized text in the range
  #  exact: is it equal to the normalized pattern?
  #  exactExceptCase: (only when not exact) equal, ignoring case?
  #  comparison: (only when not exact and useFuzzy) result of @dmp.compare
  _analyzeMatch: (pattern, charRange, useFuzzy = false) ->
    expected = @_normalizeString pattern
    # Use an exclusive slice. The inclusive form "[start .. end - 1]" falls
    # back to "until the end of the string" when end is 0.
    found = @_normalizeString @corpus()[charRange.start...charRange.end]
    result =
      found: found
      exact: found is expected

    # If the match is not exact, check whether the changes are
    # only case differences
    unless result.exact then result.exactExceptCase =
      expected.toLowerCase() is found.toLowerCase()

    # if we are interested in fuzzy comparison, calculate that, too
    if not result.exact and useFuzzy
      @ensureDMP()
      result.comparison = @dmp.compare expected, found

    result

  # Create the fuzzy matcher (@dmp) on first use.
  # Throws an Error if window.DTM_DMPMatcher has not been loaded.
  ensureDMP: ->
    unless @dmp?
      unless window.DTM_DMPMatcher?
        throw new Error "DTM_DMPMatcher is not available. Have you loaded the text match engines?"
      @dmp = new window.DTM_DMPMatcher
// Generated by CoffeeScript 1.7.1
(function() {
  window.DomTextMatcher = (function() {
    // Copy every enumerable property of each source object onto target.
    function mergeInto(target, sources) {
      var i, key, source;
      for (i = 0; i < sources.length; i++) {
        source = sources[i];
        for (key in source) {
          target[key] = source[key];
        }
      }
      return target;
    }

    /**
     * Text search front-end for the match engines (DTM_ExactMatcher,
     * DTM_RegexMatcher, DTM_DMPMatcher).
     *
     * @param {Function} corpus - returns the text to search in; it is
     *   called every time the text is needed.
     */
    function DomTextMatcher(corpus) {
      this.corpus = corpus;
    }

    /**
     * Search for text using exact string matching.
     *
     * @param {string} pattern - what to search for
     * @param {boolean} [distinct=true] - forbid overlapping matches?
     * @param {boolean} [caseSensitive=false]
     * @returns {Object} see _search()
     */
    DomTextMatcher.prototype.searchExact = function(pattern, distinct, caseSensitive) {
      if (distinct == null) {
        distinct = true;
      }
      if (caseSensitive == null) {
        caseSensitive = false;
      }
      if (!this.pm) {
        this.pm = new window.DTM_ExactMatcher;
      }
      this.pm.setDistinct(distinct);
      this.pm.setCaseSensitive(caseSensitive);
      return this._search(this.pm, pattern);
    };

    /**
     * Search for text using regular expressions.
     *
     * @param {string} pattern - what to search for
     * @param {boolean} [caseSensitive=false]
     * @returns {Object} see _search()
     */
    DomTextMatcher.prototype.searchRegex = function(pattern, caseSensitive) {
      if (caseSensitive == null) {
        caseSensitive = false;
      }
      if (!this.rm) {
        this.rm = new window.DTM_RegexMatcher;
      }
      this.rm.setCaseSensitive(caseSensitive);
      return this._search(this.rm, pattern);
    };

    /**
     * Search for text using fuzzy text matching.
     *
     * @param {string} pattern - what to search for
     * @param {number} pos - where to start searching
     * @param {boolean} [caseSensitive=false]
     * @param {Object} [options] - matchDistance (default 1000) and
     *   matchThreshold (default 0.5) tune the d-m-p library; the whole
     *   object is also passed on to the matcher.
     * @returns {Object} see _search()
     */
    DomTextMatcher.prototype.searchFuzzy = function(pattern, pos, caseSensitive, options) {
      var distance, threshold;
      if (caseSensitive == null) {
        caseSensitive = false;
      }
      if (options == null) {
        options = {};
      }
      this.ensureDMP();
      distance = options.matchDistance;
      threshold = options.matchThreshold;
      this.dmp.setMatchDistance(distance != null ? distance : 1000);
      this.dmp.setMatchThreshold(threshold != null ? threshold : 0.5);
      this.dmp.setCaseSensitive(caseSensitive);
      return this._search(this.dmp, pattern, pos, options);
    };

    /**
     * Search for a range that is identified by its surrounding context.
     *
     * @param {string} prefix - text expected right before the range (required)
     * @param {string} suffix - text expected right after the range (required)
     * @param {?string} pattern - text expected in the range; when null,
     *   whatever lies between the two contexts is accepted
     * @param {?number} [expectedStart] - expected start of the range
     * @param {?number} [expectedEnd] - expected end of the range
     * @param {boolean} [caseSensitive=false] - for the context search
     * @param {Object} [options] - contextMatchDistance (default: twice the
     *   corpus length), contextMatchThreshold (default 0.5),
     *   patternMatchThreshold (default 0.5), flexContext
     * @returns {{matches: Array}} zero or one match, carrying start, end
     *   and the fields produced by _analyzeMatch()
     * @throws {Error} if prefix or suffix is missing
     */
    DomTextMatcher.prototype.searchFuzzyWithContext = function(prefix, suffix, pattern, expectedStart, expectedEnd, caseSensitive, options) {
      var a1, a2, analysis, bestError, candidate, charRange, contextDistance, contextThreshold, corpus, expectedPrefixStart, flexMatch, flexMatches, flexRange, i, len, matchThreshold, patternLength, prefixEnd, prefixError, prefixResult, prefixStart, remainingText, suffixEnd, suffixError, suffixResult, suffixStart, totalError;
      if (expectedStart == null) {
        expectedStart = null;
      }
      if (expectedEnd == null) {
        expectedEnd = null;
      }
      if (caseSensitive == null) {
        caseSensitive = false;
      }
      if (options == null) {
        options = {};
      }
      this.ensureDMP();
      if (prefix == null || suffix == null) {
        throw new Error("Can not do a context-based fuzzy search with missing context!");
      }
      len = this.corpus().length;

      // Where to start looking for the prefix
      if (expectedStart != null) {
        expectedPrefixStart = Math.max(0, expectedStart - prefix.length);
      } else {
        expectedPrefixStart = Math.floor(len / 2);
      }

      contextDistance = options.contextMatchDistance;
      contextThreshold = options.contextMatchThreshold;
      this.dmp.setMatchDistance(contextDistance != null ? contextDistance : len * 2);
      this.dmp.setMatchThreshold(contextThreshold != null ? contextThreshold : 0.5);
      // Apply the requested case sensitivity, so that a setting left over
      // from an earlier searchFuzzy() call can not leak into this search.
      this.dmp.setCaseSensitive(caseSensitive);

      prefixResult = this.dmp.search(this.corpus(), prefix, expectedPrefixStart);
      if (!prefixResult.length) {
        return {
          matches: []
        };
      }
      prefixStart = prefixResult[0].start;
      prefixEnd = prefixResult[0].end;

      // The suffix is expected one pattern-length after the prefix
      if (pattern != null) {
        patternLength = pattern.length;
      } else if (expectedStart != null && expectedEnd != null) {
        patternLength = expectedEnd - expectedStart;
      } else {
        // No information at all; use an arbitrary guess
        patternLength = 64;
      }
      remainingText = this.corpus().substr(prefixEnd);
      suffixResult = this.dmp.search(remainingText, suffix, patternLength);
      if (!suffixResult.length) {
        return {
          matches: []
        };
      }
      suffixStart = prefixEnd + suffixResult[0].start;
      suffixEnd = prefixEnd + suffixResult[0].end;

      // The range between the prefix and the suffix
      charRange = {
        start: prefixEnd,
        end: suffixStart
      };
      matchThreshold = options.patternMatchThreshold;
      if (matchThreshold == null) {
        matchThreshold = 0.5;
      }

      if (pattern != null) {
        analysis = this._analyzeMatch(pattern, charRange, true);
      } else {
        // There is nothing to compare against (and _analyzeMatch can not
        // normalize a null pattern), so only report the text that was found.
        analysis = {
          found: this._normalizeString(this.corpus().slice(charRange.start, charRange.end))
        };
      }

      // Optionally relocate the match to an exact occurrence of the pattern
      // nearby, even if that makes the context fit worse.
      if ((pattern != null) && options.flexContext && !analysis.exact) {
        if (!this.pm) {
          this.pm = new window.DTM_ExactMatcher;
        }
        this.pm.setDistinct(false);
        this.pm.setCaseSensitive(false);
        corpus = this.corpus();
        flexMatches = this.pm.search(corpus.slice(prefixStart, suffixEnd + 1), pattern);
        candidate = null;
        bestError = 2;
        for (i = 0; i < flexMatches.length; i++) {
          flexMatch = flexMatches[i];
          flexRange = {
            start: prefixStart + flexMatch.start,
            end: prefixStart + flexMatch.end
          };
          // How would the prefix fare with this range?
          a1 = this._analyzeMatch(prefix, {
            start: prefixStart,
            end: flexRange.start
          }, true);
          prefixError = a1.exact ? 0 : a1.comparison.errorLevel;
          // How would the suffix fare with this range?
          a2 = this._analyzeMatch(suffix, {
            start: flexRange.end,
            end: suffixEnd
          }, true);
          suffixError = a2.exact ? 0 : a2.comparison.errorLevel;
          // Only consider ranges where at least one context fits exactly
          if (a1.exact || a2.exact) {
            totalError = prefixError + suffixError;
            if (totalError < bestError) {
              candidate = flexRange;
              bestError = totalError;
            }
          }
        }
        if (candidate != null) {
          console.log("flexContext adjustment: we found a better candidate!");
          charRange = candidate;
          analysis = this._analyzeMatch(pattern, charRange, true);
        }
      }

      // Accept when there is nothing to compare, the text is exact,
      // or the error level is still tolerable.
      if ((pattern == null) || analysis.exact || (analysis.comparison.errorLevel <= matchThreshold)) {
        return {
          matches: [mergeInto({}, [charRange, analysis])]
        };
      }
      return {
        matches: []
      };
    };

    /**
     * Canonical form of a string: runs of two or more whitespace characters
     * are collapsed into one space, and the result is trimmed.
     */
    DomTextMatcher.prototype._normalizeString = function(string) {
      return (string.replace(/\s{2,}/g, " ")).trim();
    };

    /**
     * Search for text with a custom matcher object.
     *
     * @param {Object} matcher - does the plain-text part of the search
     * @param {string} pattern - what to search for
     * @param {number} [pos] - where we expect to find it
     * @param {Object} [options] - passed on to the matcher;
     *   withFuzzyComparison (default false) requests a fuzzy comparison
     *   for inexact matches
     * @returns {{matches: Array, time: Object}} each match carries the
     *   matcher's fields merged with the result of _analyzeMatch(); time
     *   holds phase1_textMatching, phase2_matchMapping and total
     * @throws {Error} if the pattern is null, or empty after trimming
     */
    DomTextMatcher.prototype._search = function(matcher, pattern, pos, options) {
      var analysis, fuzzyComparison, i, matches, t1, t2, t3, textMatches;
      if (options == null) {
        options = {};
      }
      if (pattern == null) {
        throw new Error("Can't search for null pattern!");
      }
      pattern = pattern.trim();
      // trim() never returns null, so emptiness has to be tested explicitly
      if (!pattern) {
        throw new Error("Can't search for an empty pattern!");
      }
      fuzzyComparison = options.withFuzzyComparison;
      if (fuzzyComparison == null) {
        fuzzyComparison = false;
      }
      t1 = this.timestamp();
      textMatches = matcher.search(this.corpus(), pattern, pos, options);
      t2 = this.timestamp();
      matches = [];
      for (i = 0; i < textMatches.length; i++) {
        analysis = this._analyzeMatch(pattern, textMatches[i], fuzzyComparison);
        matches.push(mergeInto({}, [textMatches[i], analysis]));
      }
      t3 = this.timestamp();
      return {
        matches: matches,
        time: {
          phase1_textMatching: t2 - t1,
          phase2_matchMapping: t3 - t2,
          total: t3 - t1
        }
      };
    };

    /** Current time, in milliseconds. */
    DomTextMatcher.prototype.timestamp = function() {
      return new Date().getTime();
    };

    /**
     * Read a match returned by a matcher engine, and compare it with the pattern.
     *
     * @param {string} pattern - the text that was searched for
     * @param {{start: number, end: number}} charRange - corpus offsets,
     *   end exclusive
     * @param {boolean} [useFuzzy=false] - also run a d-m-p comparison when
     *   the match is not exact?
     * @returns {Object} found (normalized text in the range), exact,
     *   exactExceptCase (only when not exact), comparison (only when not
     *   exact and useFuzzy)
     */
    DomTextMatcher.prototype._analyzeMatch = function(pattern, charRange, useFuzzy) {
      var expected, found, result;
      if (useFuzzy == null) {
        useFuzzy = false;
      }
      expected = this._normalizeString(pattern);
      // Plain exclusive slice. The generated "+(end - 1) + 1 || 9e9" form
      // returned the rest of the corpus for a range ending at 0.
      found = this._normalizeString(this.corpus().slice(charRange.start, charRange.end));
      result = {
        found: found,
        exact: found === expected
      };
      if (!result.exact) {
        result.exactExceptCase = expected.toLowerCase() === found.toLowerCase();
      }
      if (!result.exact && useFuzzy) {
        this.ensureDMP();
        result.comparison = this.dmp.compare(expected, found);
      }
      return result;
    };

    /**
     * Create the fuzzy matcher (this.dmp) on first use.
     *
     * @throws {Error} if window.DTM_DMPMatcher has not been loaded
     */
    DomTextMatcher.prototype.ensureDMP = function() {
      if (this.dmp == null) {
        if (window.DTM_DMPMatcher == null) {
          throw new Error("DTM_DMPMatcher is not available. Have you loaded the text match engines?");
        }
        return this.dmp = new window.DTM_DMPMatcher;
      }
    };

    return DomTextMatcher;

  })();

}).call(this);
# Common functions for all page-based document mapper modules
#
# This class relies on members that are not defined here and have to be
# supplied by the concrete mapper: @pageInfo, @_isPageRendered(),
# @getRootNodeForPage(), @getPageForNode() and, optionally,
# @requiresSmartStringPadding.
class window.PageTextMapperCore

  # Max length of the prefix / suffix returned by getContextForCharRange()
  CONTEXT_LEN: 32

  # Get the page index for a given character position
  #
  # Returns the "index" of the first page for which
  # start <= pos < end holds, or -1 if there is no such page.
  getPageIndexForPos: (pos) ->
    for info in @pageInfo
      if info.start <= pos < info.end
        return info.index
    return -1

  # A new page was rendered
  #
  # If the page is not really rendered yet (or there is no info about
  # it yet), the check is repeated after a second.
  _onPageRendered: (index) =>
    # Is it really rendered?
    unless @_isPageRendered(index) and @pageInfo[index]
      setTimeout (=> @_onPageRendered index), 1000
      return

    # Collect info about the new DOM subtree
    @_mapPage @pageInfo[index]

  # Determine whether a given page has been rendered and mapped
  isPageMapped: (index) ->
    return @pageInfo[index]?.domMapper?

  # Create the mappings for a given page
  #
  # Attaches a fresh DomTextMapper to the page info, scans the page's
  # root node, logs a warning if the rendered text differs from the
  # extracted text, then asynchronously dispatches a "docPageMapped"
  # event (carrying pageIndex) on window.
  _mapPage: (info) ->
    info.node = @getRootNodeForPage info.index
    info.domMapper = new DomTextMapper("d-t-m for page #" + info.index)
    info.domMapper.setRootNode info.node
    info.domMapper.documentChanged()
    if @requiresSmartStringPadding
      info.domMapper.setExpectedContent info.content
    info.domMapper.scan()
    renderedContent = info.domMapper.getCorpus()
    if renderedContent isnt info.content
      console.log "Oops. Mismatch between rendered and extracted text, while mapping page #" + info.index + "!"
      console.trace()
      console.log "Rendered: " + renderedContent
      console.log "Extracted: " + info.content

    # Announce the newly available page
    setTimeout ->
      event = document.createEvent "UIEvents"
      event.initUIEvent "docPageMapped", false, false, window, 0
      event.pageIndex = info.index
      window.dispatchEvent event

  # Update the mappings for a given page
  _updateMap: (info) ->
    info.domMapper.documentChanged()
    info.domMapper.scan()

  # Delete the mappings for a given page
  _unmapPage: (info) ->
    delete info.domMapper

    # Announce the unavailable page
    event = document.createEvent "UIEvents"
    event.initUIEvent "docPageUnmapped", false, false, window, 0
    event.pageIndex = info.index
    window.dispatchEvent event

  # Announce scrolling
  _onScroll: ->
    event = document.createEvent "UIEvents"
    event.initUIEvent "docPageScrolling", false, false, window, 0
    window.dispatchEvent event

  # Look up info about a given DOM node, uniting page and node info
  #
  # Returns null if the page of the node is not mapped yet.
  getInfoForNode: (node) ->
    pageData = @getPageForNode node

    # Give up if the given page is not mapped yet
    return null unless pageData.domMapper

    nodeData = pageData.domMapper.getInfoForNode node

    # Copy info about the node
    info = {}
    for k, v of nodeData
      info[k] = v

    # Correct the character offsets with that of the page
    info.start += pageData.start
    info.end += pageData.start
    info.pageIndex = pageData.index
    info

  # Look up the start offset of a given DOM node, uniting page and node info
  getStartPosForNode: (node) ->
    pageData = @getPageForNode node
    nodeStart = pageData.domMapper.getStartPosForNode node
    pageData.start + nodeStart

  # Look up the end offset of a given DOM node, uniting page and node info
  getEndPosForNode: (node) ->
    pageData = @getPageForNode node
    nodeEnd = pageData.domMapper.getEndPosForNode node
    pageData.start + nodeEnd

  # Return some data about a given character range
  #
  # Parameters:
  #  start, end: the character range, in corpus offsets
  #  pages: the page indices to query (optional; defaults to all the
  #         pages from the page of start to the page of end)
  #
  # Returns an object with a "sections" field, which maps each page
  # index to the first section reported by that page's d-t-m.
  getMappingsForCharRange: (start, end, pages) ->
    # Check out which pages are these on
    startIndex = @getPageIndexForPos start
    endIndex = @getPageIndexForPos end

    # Function to get the relevant section inside a given page
    getSection = (index) =>
      info = @pageInfo[index]

      # Calculate in-page offsets
      realStart = (Math.max info.start, start) - info.start
      realEnd = (Math.min info.end, end) - info.start

      # Get the range inside the page
      mappings = info.domMapper.getMappingsForCharRange realStart, realEnd
      mappings.sections[0]

    # Get the section for all involved pages
    sections = {}
    for index in pages ? [startIndex..endIndex]
      sections[index] = getSection index

    # Return the data
    sections: sections

  # Get the text of the whole document.
  # Throws an Error if the corpus has not been built (or is empty).
  getCorpus: ->
    unless @_corpus
      throw new Error "Hey! Called getCorpus() before corpus defined!"
    @_corpus

  # Get the text around a character range: up to CONTEXT_LEN characters
  # before start and after end, both trimmed, as [prefix, suffix].
  getContextForCharRange: (start, end) ->
    prefixStart = Math.max 0, start - @CONTEXT_LEN
    prefixLen = start - prefixStart
    prefix = @_corpus.substr prefixStart, prefixLen
    suffix = @_corpus.substr end, @CONTEXT_LEN
    [prefix.trim(), suffix.trim()]

  # Call this in scan, when you have the page contents
  _onHavePageContents: ->
    # Join all the text together
    @_corpus = (info.content for info in @pageInfo).join " "

    # Go over the pages, and calculate some basic info.
    # Each page's range includes the one-character separator after it.
    pos = 0
    @pageInfo.forEach (info) ->
      info.len = info.content.length
      info.start = pos
      info.end = (pos += info.len + 1)

  # Call this in scan, after resolving the promise
  _onAfterScan: ->
    # Go over the pages again, and map the rendered ones
    @pageInfo.forEach (info, i) =>
      if @_isPageRendered i
        @_mapPage info
// Generated by CoffeeScript 1.7.1
(function() {
  var __bind = function(fn, me){ return function(){ return fn.apply(me, arguments); }; };

  // Dispatch a "UIEvents" event of the given type on window. When
  // pageIndex is given, it is attached to the event as `pageIndex`.
  function announce(type, pageIndex) {
    var event;
    event = document.createEvent("UIEvents");
    event.initUIEvent(type, false, false, window, 0);
    if (pageIndex !== undefined) {
      event.pageIndex = pageIndex;
    }
    return window.dispatchEvent(event);
  }

  window.PageTextMapperCore = (function() {
    /**
     * Common functions for all page-based document mapper modules.
     *
     * Relies on members that are not defined here and have to be supplied
     * by the concrete mapper: pageInfo, _isPageRendered(),
     * getRootNodeForPage(), getPageForNode() and, optionally,
     * requiresSmartStringPadding.
     */
    function PageTextMapperCore() {
      // Bound, so it can be handed over as a callback
      this._onPageRendered = __bind(this._onPageRendered, this);
    }

    /** Max length of the prefix / suffix returned by getContextForCharRange(). */
    PageTextMapperCore.prototype.CONTEXT_LEN = 32;

    /**
     * Get the page index for a given character position.
     *
     * @returns the `index` of the first page for which
     *   start <= pos < end holds, or -1 if there is no such page
     */
    PageTextMapperCore.prototype.getPageIndexForPos = function(pos) {
      var i, info, pages;
      pages = this.pageInfo;
      for (i = 0; i < pages.length; i++) {
        info = pages[i];
        if (info.start <= pos && pos < info.end) {
          return info.index;
        }
      }
      return -1;
    };

    /**
     * A new page was rendered. If it is not really rendered yet (or there
     * is no info about it yet), the check is repeated after a second.
     */
    PageTextMapperCore.prototype._onPageRendered = function(index) {
      var self = this;
      if (!(this._isPageRendered(index) && this.pageInfo[index])) {
        setTimeout(function() {
          return self._onPageRendered(index);
        }, 1000);
        return;
      }
      return this._mapPage(this.pageInfo[index]);
    };

    /** Determine whether a given page has been rendered and mapped. */
    PageTextMapperCore.prototype.isPageMapped = function(index) {
      var info = this.pageInfo[index];
      return (info != null ? info.domMapper : void 0) != null;
    };

    /**
     * Create the mappings for a given page: attach a fresh DomTextMapper,
     * scan the page's root node, log a warning if the rendered text differs
     * from the extracted text, then asynchronously dispatch "docPageMapped".
     */
    PageTextMapperCore.prototype._mapPage = function(info) {
      var renderedContent;
      info.node = this.getRootNodeForPage(info.index);
      info.domMapper = new DomTextMapper("d-t-m for page #" + info.index);
      info.domMapper.setRootNode(info.node);
      info.domMapper.documentChanged();
      if (this.requiresSmartStringPadding) {
        info.domMapper.setExpectedContent(info.content);
      }
      info.domMapper.scan();
      renderedContent = info.domMapper.getCorpus();
      if (renderedContent !== info.content) {
        console.log("Oops. Mismatch between rendered and extracted text, while mapping page #" + info.index + "!");
        console.trace();
        console.log("Rendered: " + renderedContent);
        console.log("Extracted: " + info.content);
      }
      // Announce the newly available page
      return setTimeout(function() {
        return announce("docPageMapped", info.index);
      });
    };

    /** Update the mappings for a given page. */
    PageTextMapperCore.prototype._updateMap = function(info) {
      info.domMapper.documentChanged();
      return info.domMapper.scan();
    };

    /** Delete the mappings for a given page, and dispatch "docPageUnmapped". */
    PageTextMapperCore.prototype._unmapPage = function(info) {
      delete info.domMapper;
      return announce("docPageUnmapped", info.index);
    };

    /** Announce scrolling by dispatching "docPageScrolling". */
    PageTextMapperCore.prototype._onScroll = function() {
      return announce("docPageScrolling");
    };

    /**
     * Look up info about a given DOM node, uniting page and node info.
     *
     * @returns a copy of the d-t-m's node info with start/end shifted by the
     *   page's start offset and pageIndex added, or null if the page of the
     *   node is not mapped yet
     */
    PageTextMapperCore.prototype.getInfoForNode = function(node) {
      var info, k, nodeData, pageData;
      pageData = this.getPageForNode(node);
      if (!pageData.domMapper) {
        return null;
      }
      nodeData = pageData.domMapper.getInfoForNode(node);
      info = {};
      for (k in nodeData) {
        info[k] = nodeData[k];
      }
      info.start += pageData.start;
      info.end += pageData.start;
      info.pageIndex = pageData.index;
      return info;
    };

    /** Look up the start offset of a given DOM node, in corpus offsets. */
    PageTextMapperCore.prototype.getStartPosForNode = function(node) {
      var pageData = this.getPageForNode(node);
      return pageData.start + pageData.domMapper.getStartPosForNode(node);
    };

    /** Look up the end offset of a given DOM node, in corpus offsets. */
    PageTextMapperCore.prototype.getEndPosForNode = function(node) {
      var pageData = this.getPageForNode(node);
      return pageData.start + pageData.domMapper.getEndPosForNode(node);
    };

    /**
     * Return some data about a given character range.
     *
     * @param start, end - the character range, in corpus offsets
     * @param [pages] - the page indices to query; defaults to all the pages
     *   from the page of start to the page of end
     * @returns {{sections: Object}} maps each page index to the first
     *   section reported by that page's d-t-m
     */
    PageTextMapperCore.prototype.getMappingsForCharRange = function(start, end, pages) {
      var ascending, endIndex, i, index, indices, info, mappings, realEnd, realStart, sections, startIndex;
      startIndex = this.getPageIndexForPos(start);
      endIndex = this.getPageIndexForPos(end);
      indices = pages;
      if (indices == null) {
        // Inclusive range from startIndex to endIndex, in either direction
        indices = [];
        ascending = startIndex <= endIndex;
        for (index = startIndex; ascending ? index <= endIndex : index >= endIndex; ascending ? index++ : index--) {
          indices.push(index);
        }
      }
      sections = {};
      for (i = 0; i < indices.length; i++) {
        index = indices[i];
        info = this.pageInfo[index];
        // Clip the range to the page, and convert it to in-page offsets
        realStart = Math.max(info.start, start) - info.start;
        realEnd = Math.min(info.end, end) - info.start;
        mappings = info.domMapper.getMappingsForCharRange(realStart, realEnd);
        sections[index] = mappings.sections[0];
      }
      return {
        sections: sections
      };
    };

    /**
     * Get the text of the whole document.
     *
     * @throws {Error} if the corpus has not been built (or is empty)
     */
    PageTextMapperCore.prototype.getCorpus = function() {
      if (!this._corpus) {
        throw new Error("Hey! Called getCorpus() before corpus defined!");
      }
      return this._corpus;
    };

    /**
     * Get the text around a character range: up to CONTEXT_LEN characters
     * before start and after end, both trimmed.
     *
     * @returns {Array} [prefix, suffix]
     */
    PageTextMapperCore.prototype.getContextForCharRange = function(start, end) {
      var prefix, prefixStart, suffix;
      prefixStart = Math.max(0, start - this.CONTEXT_LEN);
      prefix = this._corpus.substr(prefixStart, start - prefixStart);
      suffix = this._corpus.substr(end, this.CONTEXT_LEN);
      return [prefix.trim(), suffix.trim()];
    };

    /**
     * Call this in scan, when you have the page contents. Builds the corpus
     * by joining the page contents with single spaces, and stores len, start
     * and end on every page info. Each page's range includes the
     * one-character separator after it.
     */
    PageTextMapperCore.prototype._onHavePageContents = function() {
      var contents, i, pos;
      contents = [];
      for (i = 0; i < this.pageInfo.length; i++) {
        contents.push(this.pageInfo[i].content);
      }
      this._corpus = contents.join(" ");
      pos = 0;
      return this.pageInfo.forEach(function(info) {
        info.len = info.content.length;
        info.start = pos;
        return info.end = (pos += info.len + 1);
      });
    };

    /** Call this in scan, after resolving the promise: maps the rendered pages. */
    PageTextMapperCore.prototype._onAfterScan = function() {
      var self = this;
      return this.pageInfo.forEach(function(info, i) {
        if (self._isPageRendered(i)) {
          return self._mapPage(info);
        }
      });
    };

    return PageTextMapperCore;

  })();

}).call(this);
# Naive text matcher
#
# Finds the occurrences of a pattern in a text by plain substring search.
class window.DTM_ExactMatcher

  constructor: ->
    @distinct = true
    @caseSensitive = false

  # Forbid overlapping matches?
  setDistinct: (value) -> @distinct = value

  # Should the search be case sensitive?
  setCaseSensitive: (value) -> @caseSensitive = value

  # Find all the occurrences of pattern in text
  #
  # Returns a list of {start, end} objects (end is exclusive), in order
  # of appearance. When @distinct is set, the search continues after the
  # end of each match; otherwise one character after its start, so
  # overlapping matches are reported, too.
  #
  # An empty pattern yields an empty list.
  search: (text, pattern) ->
    pLen = pattern.length
    results = []

    # An empty pattern is "found" at every position without ever advancing
    # the scan, so it has to be rejected up front to avoid an endless loop.
    return results if pLen is 0

    unless @caseSensitive
      text = text.toLowerCase()
      pattern = pattern.toLowerCase()

    # How far to move on after a hit
    step = if @distinct then pLen else 1

    pos = text.indexOf pattern
    while pos > -1
      results.push
        start: pos
        end: pos + pLen
      pos = text.indexOf pattern, pos + step

    results
# Regular expression based text matcher
class window.DTM_RegexMatcher

  constructor: ->
    @caseSensitive = false

  # Should the search be case sensitive?
  setCaseSensitive: (value) -> @caseSensitive = value

  # Find all the matches of a regular expression in text
  #
  # Parameters:
  #  text: the string to search in
  #  pattern: the source of the regular expression. It is compiled
  #           with the "g" flag, plus "i" unless the matcher is case sensitive.
  #
  # Returns a list of {start, end} objects (end is exclusive),
  # in order of appearance.
  search: (text, pattern) ->
    re = new RegExp pattern, if @caseSensitive then "g" else "gi"
    results = []
    while (m = re.exec text)?
      results.push
        start: m.index
        end: m.index + m[0].length
      # exec() does not advance lastIndex after a zero-length match;
      # step over it by hand, or the same match would be returned forever.
      re.lastIndex += 1 if m[0].length is 0
    results
# diff-match-patch - based text matcher
class window.DTM_DMPMatcher
constructor: ->
@dmp = new diff_match_patch
@dmp.Diff_Timeout = 0
@caseSensitive = false
_reverse: (text) -> text.split("").reverse().join ""
# Use this to get the max allowed pattern length.
# Trying to use a longer pattern will give an error.
getMaxPatternLength: -> @dmp.Match_MaxBits
# The following example is a classic dilemma.
# There are two potential matches, one is close to the expected location
# but contains a one character error, the other is far from the expected
# location but is exactly the pattern sought after:
#
# match_main("abc12345678901234567890abbc", "abc", 26)
#
# Which result is returned (0 or 24) is determined by the
# MatchDistance property.
#
# An exact letter match which is 'distance' characters away
# from the fuzzy location would score as a complete mismatch.
# For example, a distance of '0' requires the match be at the exact
# location specified, whereas a threshold of '1000' would require
# a perfect match to be within 800 characters of the expected location
# to be found using a 0.8 threshold (see below).
#
# The larger MatchDistance is, the slower search may take to compute.
#
# This variable defaults to 1000.
setMatchDistance: (distance) -> @dmp.Match_Distance = distance
getMatchDistance: -> @dmp.Match_Distance
# MatchThreshold determines the cut-off value for a valid match.
#
# If Match_Threshold is closer to 0, the requirements for accuracy
# increase. If Match_Threshold is closer to 1 then it is more likely
# that a match will be found. The larger Match_Threshold is, the slower
# search may take to compute.
#
# This variable defaults to 0.5.
setMatchThreshold: (threshold) -> @dmp.Match_Threshold = threshold
getMatchThreshold: -> @dmp.Match_Threshold
getCaseSensitive: -> caseSensitive
setCaseSensitive: (value) -> @caseSensitive = value
# Given a text to search, a pattern to search for and an
# expected location in the text near which to find the pattern,
# return the location which matches closest.
#
# The function will search for the best match based on both the number
# of character errors between the pattern and the potential match,
# as well as the distance between the expected location and the
# potential match.
#
# If no match is found, the function returns null.
search: (text, pattern, expectedStartLoc = 0, options = {}) ->
# console.log "In dtm search. text: '" + text + "', pattern: '" + pattern +
# "', expectedStartLoc: " + expectedStartLoc + ", options:"
# console.log options
if expectedStartLoc < 0
throw new Error "Can't search at negative indices!"
if expectedStartLoc isnt Math.floor expectedStartLoc
throw new Error "Expected start location must be an integer."
unless @caseSensitive
text = text.toLowerCase()
pattern = pattern.toLowerCase()
pLen = pattern.length
maxLen = @getMaxPatternLength()
if pLen <= maxLen
result = @searchForSlice text, pattern, expectedStartLoc
else
startSlice = pattern.substr 0, maxLen
startPos = @searchForSlice text, startSlice, expectedStartLoc
if startPos?
startLen = startPos.end - startPos.start
endSlice = pattern.substr pLen - maxLen, maxLen
endLoc = startPos.start + pLen - maxLen
endPos = @searchForSlice text, endSlice, endLoc
if endPos?
endLen = endPos.end - endPos.start
matchLen = endPos.end - startPos.start
startIndex = startPos.start
endIndex = endPos.end
if pLen*0.5 <= matchLen <= pLen*1.5
result =
start: startIndex
end: endPos.end
# data:
# startError: startPos.data.error
# endError: endPos.data.error
# uncheckedMidSection: Math.max 0, matchLen - startLen - endLen
# lengthError: matchLen - pLen
# else
# console.log "Sorry, matchLen (" + matchLen + ") is not between " +
# 0.5*pLen + " and " + 1.5*pLen
# else
# console.log "endSlice ('" + endSlice + "') not found"
# else
# console.log "startSlice ('" + startSlice + "') not found"
unless result? then return []
if options.withLevenhstein or options.withDiff
found = text.substr result.start, result.end - result.start
result.diff = @dmp.diff_main pattern, found
if options.withLevenshstein
result.lev = @dmp.diff_levenshtein result.diff
if options.withDiff
@dmp.diff_cleanupSemantic result.diff
result.diffHTML = @dmp.diff_prettyHtml result.diff
[result]
# Compare two strings: compute their Levenshtein distance and a visual diff.
#
# Parameters:
#  text1: the reference string (must not be null or undefined)
#
#  text2: the string to compare against it (must not be null or undefined)
#
# Returns an object with the following fields:
#  diff: the diff list produced by @dmp.diff_main, after
#        @dmp.diff_cleanupSemantic has been applied to it
#
#  lev: the Levenshtein distance computed from the raw diff
#
#  errorLevel: lev divided by the length of text1
#              (0 when the two strings are identical)
#
#  diffHTML: the HTML rendering of the diff, from @dmp.diff_prettyHtml
#
# Throws an Error if either argument is missing.
compare: (text1, text2) ->
  unless (text1? and text2?)
    throw new Error "Can not compare non-existing strings!"
  result = {}
  result.diff = @dmp.diff_main text1, text2
  result.lev = @dmp.diff_levenshtein result.diff
  # A zero distance always means "no error". Short-circuiting here avoids
  # the 0 / 0 = NaN that two empty strings would otherwise produce.
  result.errorLevel =
    if result.lev is 0 then 0 else result.lev / text1.length
  @dmp.diff_cleanupSemantic result.diff
  result.diffHTML = @dmp.diff_prettyHtml result.diff
  result
# ============= Private part ==========================================
# You don't need to call the functions below this point manually
searchForSlice: (text, slice, expectedStartLoc) ->
  # Forward pass: locate where the slice begins in the text.
  forward = @dmp.match_main text, slice, expectedStartLoc
  matchStart = forward.index
  return null if matchStart is -1

  # Backward pass: run the same match on the reversed text and the reversed
  # slice, so that the start of that match marks the end of the real one.
  reversedText = @_reverse text
  reversedSlice = @_reverse slice
  mirroredLoc = text.length - (matchStart + slice.length)
  backward = @dmp.match_main reversedText, reversedSlice, mirroredLoc

  # Translate the position in the reversed text back to the original text.
  matchEnd = text.length - backward.index
  {start: matchStart, end: matchEnd}
// Generated by CoffeeScript 1.7.1
(function() {
window.DTM_ExactMatcher = (function() {
  /**
   * Exact (substring) text matcher.
   *
   * Defaults: overlapping matches are forbidden (`distinct = true`) and the
   * search is case-insensitive (`caseSensitive = false`).
   */
  function DTM_ExactMatcher() {
    this.distinct = true;
    this.caseSensitive = false;
  }

  /**
   * Choose whether matches may overlap.
   *
   * @param {boolean} value true to forbid overlapping matches.
   * @returns {boolean} the value that was set.
   */
  DTM_ExactMatcher.prototype.setDistinct = function(value) {
    return this.distinct = value;
  };

  /**
   * Choose whether the search distinguishes upper and lower case.
   *
   * @param {boolean} value true for a case-sensitive search.
   * @returns {boolean} the value that was set.
   */
  DTM_ExactMatcher.prototype.setCaseSensitive = function(value) {
    return this.caseSensitive = value;
  };

  /**
   * Find every occurrence of `pattern` in `text`.
   *
   * @param {string} text the text to search in.
   * @param {string} pattern the substring to look for.
   * @returns {Array<{start: number, end: number}>} the matches in order of
   *   appearance; `end` is exclusive. An empty pattern yields no matches.
   */
  DTM_ExactMatcher.prototype.search = function(text, pattern) {
    var from, hit, pLen, results, step;
    pLen = pattern.length;
    results = [];
    // indexOf("") succeeds at every position, so an empty pattern would
    // never let the loop below terminate.
    if (pLen === 0) {
      return results;
    }
    if (!this.caseSensitive) {
      text = text.toLowerCase();
      pattern = pattern.toLowerCase();
    }
    // Distinct matches resume after the previous match; overlapping ones
    // resume one character after the previous match's start.
    step = this.distinct ? pLen : 1;
    from = 0;
    while ((hit = text.indexOf(pattern, from)) > -1) {
      results.push({
        start: hit,
        end: hit + pLen
      });
      from = hit + step;
    }
    return results;
  };

  return DTM_ExactMatcher;
})();
window.DTM_RegexMatcher = (function() {
  /**
   * Regular-expression text matcher. Case-insensitive by default.
   */
  function DTM_RegexMatcher() {
    this.caseSensitive = false;
  }

  /**
   * Choose whether the search distinguishes upper and lower case.
   *
   * @param {boolean} value true for a case-sensitive search.
   * @returns {boolean} the value that was set.
   */
  DTM_RegexMatcher.prototype.setCaseSensitive = function(value) {
    return this.caseSensitive = value;
  };

  /**
   * Find every non-empty match of the regular expression `pattern` in `text`.
   *
   * @param {string} text the text to search in.
   * @param {string} pattern regular expression source; it is compiled with
   *   the "g" flag, plus "i" unless the matcher is case-sensitive.
   * @returns {Array<{start: number, end: number}>} the matches in order of
   *   appearance; `end` is exclusive. Zero-length matches are skipped.
   */
  DTM_RegexMatcher.prototype.search = function(text, pattern) {
    var m, re, results;
    re = new RegExp(pattern, this.caseSensitive ? "g" : "gi");
    results = [];
    while ((m = re.exec(text)) !== null) {
      if (m[0].length === 0) {
        // A zero-length match leaves lastIndex where it was, so exec()
        // would return the same match forever. Step past it by hand.
        re.lastIndex++;
        continue;
      }
      results.push({
        start: m.index,
        end: m.index + m[0].length
      });
    }
    return results;
  };

  return DTM_RegexMatcher;
})();
window.DTM_DMPMatcher = (function() {
  /**
   * Fuzzy text matcher built on a diff_match_patch instance.
   *
   * The diff timeout is set to 0 and the search is case-insensitive by
   * default.
   */
  function DTM_DMPMatcher() {
    this.dmp = new diff_match_patch();
    this.dmp.Diff_Timeout = 0;
    this.caseSensitive = false;
  }

  /**
   * Reverse a string, one UTF-16 code unit at a time.
   *
   * @param {string} text
   * @returns {string} the reversed string (same length as the input).
   */
  DTM_DMPMatcher.prototype._reverse = function(text) {
    return text.split("").reverse().join("");
  };

  /**
   * @returns {number} the longest pattern that can be matched in one go
   *   (the d-m-p instance's Match_MaxBits).
   */
  DTM_DMPMatcher.prototype.getMaxPatternLength = function() {
    return this.dmp.Match_MaxBits;
  };

  /**
   * Set the d-m-p Match_Distance tuning parameter.
   *
   * @param {number} distance
   * @returns {number} the value that was set.
   */
  DTM_DMPMatcher.prototype.setMatchDistance = function(distance) {
    return this.dmp.Match_Distance = distance;
  };

  /** @returns {number} the current d-m-p Match_Distance. */
  DTM_DMPMatcher.prototype.getMatchDistance = function() {
    return this.dmp.Match_Distance;
  };

  /**
   * Set the d-m-p Match_Threshold tuning parameter.
   *
   * @param {number} threshold
   * @returns {number} the value that was set.
   */
  DTM_DMPMatcher.prototype.setMatchThreshold = function(threshold) {
    return this.dmp.Match_Threshold = threshold;
  };

  /** @returns {number} the current d-m-p Match_Threshold. */
  DTM_DMPMatcher.prototype.getMatchThreshold = function() {
    return this.dmp.Match_Threshold;
  };

  /** @returns {boolean} whether searches are case-sensitive. */
  DTM_DMPMatcher.prototype.getCaseSensitive = function() {
    // Must read the instance field; a bare `caseSensitive` is an undeclared
    // identifier and throws a ReferenceError.
    return this.caseSensitive;
  };

  /**
   * Choose whether the search distinguishes upper and lower case.
   *
   * @param {boolean} value true for a case-sensitive search.
   * @returns {boolean} the value that was set.
   */
  DTM_DMPMatcher.prototype.setCaseSensitive = function(value) {
    return this.caseSensitive = value;
  };

  /**
   * Fuzzy-search `pattern` in `text` around an expected position.
   *
   * Patterns no longer than getMaxPatternLength() are matched directly.
   * For longer ones only the first and the last maxLen characters are
   * matched; the section in between is not checked, and the result is
   * accepted only if its total length is between 0.5 and 1.5 times the
   * pattern length.
   *
   * @param {string} text the text to search in.
   * @param {string} pattern the text to look for.
   * @param {number} [expectedStartLoc=0] where the match is expected to
   *   start; must be a non-negative integer.
   * @param {Object} [options={}]
   *   - withLevenshtein: also compute `diff` and `lev` for the match. The
   *     historical misspellings `withLevenhstein` and `withLevenshstein`
   *     are accepted as aliases.
   *   - withDiff: also compute `diff` and `diffHTML` for the match.
   * @returns {Array<Object>} an empty array when nothing was found,
   *   otherwise a single-element array holding {start, end} (end exclusive)
   *   plus the optional fields requested above.
   * @throws {Error} if expectedStartLoc is negative or not an integer.
   */
  DTM_DMPMatcher.prototype.search = function(text, pattern, expectedStartLoc, options) {
    var endLoc, endPos, endSlice, found, matchLen, maxLen, pLen, result, startPos, startSlice, wantLev;
    if (expectedStartLoc == null) {
      expectedStartLoc = 0;
    }
    if (options == null) {
      options = {};
    }
    if (expectedStartLoc < 0) {
      throw new Error("Can't search at negative indices!");
    }
    if (expectedStartLoc !== Math.floor(expectedStartLoc)) {
      throw new Error("Expected start location must be an integer.");
    }
    if (!this.caseSensitive) {
      text = text.toLowerCase();
      pattern = pattern.toLowerCase();
    }
    pLen = pattern.length;
    maxLen = this.getMaxPatternLength();
    if (pLen <= maxLen) {
      result = this.searchForSlice(text, pattern, expectedStartLoc);
    } else {
      // The pattern is too long for a single match: anchor its head first,
      // then look for its tail where it should be relative to that head.
      startSlice = pattern.substr(0, maxLen);
      startPos = this.searchForSlice(text, startSlice, expectedStartLoc);
      if (startPos != null) {
        endSlice = pattern.substr(pLen - maxLen, maxLen);
        endLoc = startPos.start + pLen - maxLen;
        endPos = this.searchForSlice(text, endSlice, endLoc);
        if (endPos != null) {
          matchLen = endPos.end - startPos.start;
          // Sanity check on the overall length, since the middle section
          // was never compared.
          if (pLen * 0.5 <= matchLen && matchLen <= pLen * 1.5) {
            result = {
              start: startPos.start,
              end: endPos.end
            };
          }
        }
      }
    }
    if (result == null) {
      return [];
    }
    // The option name used to be misspelled in two different ways, so no
    // single flag produced `lev`. Accept the correct spelling and both
    // legacy ones.
    wantLev = options.withLevenshtein || options.withLevenhstein || options.withLevenshstein;
    if (wantLev || options.withDiff) {
      found = text.substr(result.start, result.end - result.start);
      result.diff = this.dmp.diff_main(pattern, found);
      if (wantLev) {
        // Computed before the semantic cleanup below rewrites the diff.
        result.lev = this.dmp.diff_levenshtein(result.diff);
      }
      if (options.withDiff) {
        this.dmp.diff_cleanupSemantic(result.diff);
        result.diffHTML = this.dmp.diff_prettyHtml(result.diff);
      }
    }
    return [result];
  };

  /**
   * Compare two strings: compute their Levenshtein distance and a visual
   * diff.
   *
   * @param {string} text1 the reference string.
   * @param {string} text2 the string to compare against it.
   * @returns {{diff: Array, lev: number, errorLevel: number, diffHTML: string}}
   *   `errorLevel` is `lev` divided by the length of text1, and 0 when the
   *   strings are identical.
   * @throws {Error} if either argument is null or undefined.
   */
  DTM_DMPMatcher.prototype.compare = function(text1, text2) {
    var result;
    if (text1 == null || text2 == null) {
      throw new Error("Can not compare non-existing strings!");
    }
    result = {};
    result.diff = this.dmp.diff_main(text1, text2);
    result.lev = this.dmp.diff_levenshtein(result.diff);
    // A zero distance always means "no error". Short-circuiting here avoids
    // the 0 / 0 = NaN that two empty strings would otherwise produce.
    result.errorLevel = result.lev === 0 ? 0 : result.lev / text1.length;
    this.dmp.diff_cleanupSemantic(result.diff);
    result.diffHTML = this.dmp.diff_prettyHtml(result.diff);
    return result;
  };

  /**
   * Locate one slice (no longer than the maximum pattern length) in the
   * text. Not meant to be called directly.
   *
   * The start comes from a forward match. The end comes from matching the
   * reversed slice in the reversed text and mapping that position back.
   *
   * @param {string} text the text to search in.
   * @param {string} slice the text to look for.
   * @param {number} expectedStartLoc where the match is expected to start.
   * @returns {?{start: number, end: number}} null when the forward match
   *   fails, otherwise the boundaries of the match (end exclusive).
   */
  DTM_DMPMatcher.prototype.searchForSlice = function(text, slice, expectedStartLoc) {
    var dneIndex, endIndex, expectedDneLoc, expectedEndLoc, nrettap, r1, r2, startIndex, txet;
    // NOTE(review): this reads `.index` from match_main's return value, but
    // stock diff_match_patch returns a plain number there. Presumably the
    // bundled build is patched to return an object - confirm.
    r1 = this.dmp.match_main(text, slice, expectedStartLoc);
    startIndex = r1.index;
    if (startIndex === -1) {
      return null;
    }
    txet = this._reverse(text);
    nrettap = this._reverse(slice);
    expectedEndLoc = startIndex + slice.length;
    // Mirror the expected end position into the reversed text.
    expectedDneLoc = text.length - expectedEndLoc;
    r2 = this.dmp.match_main(txet, nrettap, expectedDneLoc);
    // NOTE(review): a failed reverse match (index -1) is not handled and
    // would yield end = text.length + 1.
    dneIndex = r2.index;
    endIndex = text.length - dneIndex;
    return {
      start: startIndex,
      end: endIndex
    };
  };

  return DTM_DMPMatcher;
})();
}).call(this);
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment