Commit 97a829ff authored by Sean Hammond, committed by GitHub

Merge pull request #495 from hypothesis/search-filter-decaf

Convert search query parser to JS and add tests
parents dc4e7e3b ffe2da0a
# This class will parse the search filter and produce a faceted search filter object
# It expects a search query string where the search terms are separated by space characters
# and collects them into the given term arrays
module.exports = class SearchFilter
# Splits a search term into filter and data
# i.e.
# 'user:johndoe' -> ['user', 'johndoe']
# 'example:text' -> [null, 'example:text']
_splitTerm: (term) ->
filter = term.slice 0, term.indexOf ":"
unless filter?
# The whole term is data
return [null, term]
if filter in ['group', 'quote', 'result', 'since',
'tag', 'text', 'uri', 'user']
data = term[filter.length+1..]
return [filter, data]
else
# The filter is not a power search filter, so the whole term is data
return [null, term]
# This function will slice the search-text input
# Slice character: space,
# but an expression between quotes (' or ") is considered a single token
# I.e. from the string: "text user:john 'to be or not to be'" it will produce:
# ["text", "user:john", "to be or not to be"]
_tokenize: (searchtext) ->
return [] unless searchtext
# Small helper function for removing quote characters
# from the beginning and end of a string, if the
# quote characters are the same.
# I.e.
# 'foo' -> foo
# "bar" -> bar
# 'foo" -> 'foo"
# bar" -> bar"
_removeQuoteCharacter = (text) ->
start = text.slice 0,1
end = text.slice -1
if (start is '"' or start is "'") and (start == end)
text = text.slice 1, text.length - 1
text
tokens = searchtext.match /(?:[^\s"']+|"[^"]*"|'[^']*')+/g
# Cut the opening and closing quote characters
tokens = tokens.map _removeQuoteCharacter
# Remove quotes for power search.
# I.e. 'tag:"foo bar"' -> 'tag:foo bar'
for token, index in tokens
[filter, data] = @_splitTerm(token)
if filter?
tokens[index] = filter + ':' + (_removeQuoteCharacter data)
tokens
# Turns a string query into an object, where the properties are the search terms
toObject: (searchtext) ->
obj = {}
filterToBackendFilter = (filter) ->
if filter is 'tag'
'tags'
else
filter
addToObj = (key, data) ->
if obj[key]?
obj[key].push data
else
obj[key] = [data]
if searchtext
terms = @_tokenize(searchtext)
for term in terms
[filter, data] = @_splitTerm(term)
unless filter?
filter = 'any'
data = term
addToObj(filterToBackendFilter(filter), data)
obj
# This function will generate the facets from the search-text input
# It first tokenizes the input and then sorts the tokens into facet lists
# The output will be a dict with the following structure:
# An object with facet_names as keys.
# A value for a key:
# [facet_name]:
# [operator]: 'and'|'or'|'min' (for the elements of the facet terms list)
# [lowercase]: true|false
# [terms]: an array for the matched terms for this facet
# The facet selection is done by analyzing each token.
# It generally expects a <facet_name>:<facet_term> structure for a token
# Where the facet names are: 'quote', 'result', 'since', 'tag', 'text', 'uri', 'user'
# Anything that doesn't match goes to the 'any' facet
# For the 'since' facet the time string is scanned and converted to seconds,
# so e.g. the 'since:7min' token will be converted to 7*60 = 420 for the since facet value
generateFacetedFilter: (searchtext) ->
any = []
quote = []
result = []
since = []
tag = []
text = []
uri = []
user = []
if searchtext
terms = @_tokenize(searchtext)
for term in terms
filter = term.slice 0, term.indexOf ":"
unless filter? then filter = ""
switch filter
when 'quote' then quote.push term[6..]
when 'result' then result.push term[7..]
when 'since'
# We'll turn this into seconds
time = term[6..].toLowerCase()
if time.match /^\d+$/
# Only digits, assuming seconds
since.push time
if time.match /^\d+sec$/
# Time given in seconds
t = /^(\d+)sec$/.exec(time)[1]
since.push t
if time.match /^\d+min$/
# Time given in minutes
t = /^(\d+)min$/.exec(time)[1]
since.push t * 60
if time.match /^\d+hour$/
# Time given in hours
t = /^(\d+)hour$/.exec(time)[1]
since.push t * 60 * 60
if time.match /^\d+day$/
# Time given in days
t = /^(\d+)day$/.exec(time)[1]
since.push t * 60 * 60 * 24
if time.match /^\d+week$/
# Time given in weeks
t = /^(\d+)week$/.exec(time)[1]
since.push t * 60 * 60 * 24 * 7
if time.match /^\d+month$/
# Time given in months
t = /^(\d+)month$/.exec(time)[1]
since.push t * 60 * 60 * 24 * 30
if time.match /^\d+year$/
# Time given in years
t = /^(\d+)year$/.exec(time)[1]
since.push t * 60 * 60 * 24 * 365
when 'tag' then tag.push term[4..]
when 'text' then text.push term[5..]
when 'uri' then uri.push term[4..]
when 'user' then user.push term[5..]
else any.push term
any:
terms: any
operator: 'and'
quote:
terms: quote
operator: 'and'
result:
terms: result
operator: 'min'
since:
terms: since
operator: 'and'
tag:
terms: tag
operator: 'and'
text:
terms: text
operator: 'and'
uri:
terms: uri
operator: 'or'
user:
terms: user
operator: 'or'
'use strict';
/**
* Splits a search term into filter and data.
*
* ie. 'user:johndoe' -> ['user', 'johndoe']
* 'example:text' -> [null, 'example:text']
*/
function splitTerm(term) {
var filter = term.slice(0, term.indexOf(':'));
if (!filter) {
// The whole term is data
return [null, term];
}
if (['group', 'quote', 'result', 'since',
'tag', 'text', 'uri', 'user'].includes(filter)) {
var data = term.slice(filter.length+1);
return [filter, data];
} else {
// The filter is not a power search filter, so the whole term is data
return [null, term];
}
}
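// Illustrative note (not part of the committed file): a colon inside plain data
// is preserved, because the text before the first ':' is only treated as a
// filter when it is one of the known power-search filters, e.g.
//
//   splitTerm('http://example.com')  // -> [null, 'http://example.com']
//   splitTerm('user:johndoe')        // -> ['user', 'johndoe']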
/**
* Tokenize a search query.
*
* Splits `searchtext` into tokens, separated by spaces.
* Quoted phrases in `searchtext` are returned as a single token.
*/
function tokenize(searchtext) {
if (!searchtext) { return []; }
// Small helper function for removing quote characters
// from the beginning and end of a string, if the
// quote characters are the same.
// I.e.
// 'foo' -> foo
// "bar" -> bar
// 'foo" -> 'foo"
// bar" -> bar"
var _removeQuoteCharacter = function(text) {
var start = text.slice(0,1);
var end = text.slice(-1);
if (((start === '"') || (start === "'")) && (start === end)) {
text = text.slice(1, text.length - 1);
}
return text;
};
var tokens = searchtext.match(/(?:[^\s"']+|"[^"]*"|'[^']*')+/g);
// Cut the opening and closing quote characters
tokens = tokens.map(_removeQuoteCharacter);
// Remove quotes for power search.
// I.e. 'tag:"foo bar"' -> 'tag:foo bar'
for (var index = 0; index < tokens.length; index++) {
var token = tokens[index];
var [filter, data] = splitTerm(token);
if (filter) {
tokens[index] = filter + ':' + (_removeQuoteCharacter(data));
}
}
return tokens;
}
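// Illustrative note (not part of the committed file): the match pattern above
// splits on whitespace but keeps quoted phrases together, and the surrounding
// quotes are then stripped, e.g.
//
//   tokenize("text user:john 'to be or not to be'")
//   // -> ['text', 'user:john', 'to be or not to be']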
/**
* Parse a search query into a map of search field to term.
*
* @param {string} searchtext
* @return {Object}
*/
function toObject(searchtext) {
var obj = {};
var backendFilter = f => f === 'tag' ? 'tags' : f;
var addToObj = function(key, data) {
if (obj[key]) {
return obj[key].push(data);
} else {
return obj[key] = [data];
}
};
if (searchtext) {
var terms = tokenize(searchtext);
for (var term of terms) {
var [filter, data] = splitTerm(term);
if (!filter) {
filter = 'any';
data = term;
}
addToObj(backendFilter(filter), data);
}
}
return obj;
}
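// Illustrative note (not part of the committed file): 'tag' is mapped to 'tags'
// for the backend, and terms without a recognised filter go under 'any', e.g.
//
//   toObject('user:john tag:foo hello')
//   // -> {user: ['john'], tags: ['foo'], any: ['hello']}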
/**
* @typedef Facet
* @property {'and'|'or'|'min'} operator
* @property {boolean} lowercase
* @property {string[]} terms
*/
/**
* Parse a search query into a map of filters.
*
* Returns an object mapping facet names to Facet.
*
* Terms that are not associated with a particular facet are stored in the "any"
* facet.
*
* @param {string} searchtext
* @return {Object}
*/
function generateFacetedFilter(searchtext) {
var terms;
var any = [];
var quote = [];
var result = [];
var since = [];
var tag = [];
var text = [];
var uri = [];
var user = [];
if (searchtext) {
terms = tokenize(searchtext);
for (var term of terms) {
var t;
var filter = term.slice(0, term.indexOf(':'));
switch (filter) {
case 'quote':
quote.push(term.slice(6));
break;
case 'result':
result.push(term.slice(7));
break;
case 'since':
{
// We'll turn this into seconds
let time = term.slice(6).toLowerCase();
if (time.match(/^\d+$/)) {
// Only digits, assuming seconds
since.push(time * 1);
}
if (time.match(/^\d+sec$/)) {
// Time given in seconds
t = /^(\d+)sec$/.exec(time)[1];
since.push(t * 1);
}
if (time.match(/^\d+min$/)) {
// Time given in minutes
t = /^(\d+)min$/.exec(time)[1];
since.push(t * 60);
}
if (time.match(/^\d+hour$/)) {
// Time given in hours
t = /^(\d+)hour$/.exec(time)[1];
since.push(t * 60 * 60);
}
if (time.match(/^\d+day$/)) {
// Time given in days
t = /^(\d+)day$/.exec(time)[1];
since.push(t * 60 * 60 * 24);
}
if (time.match(/^\d+week$/)) {
// Time given in weeks
t = /^(\d+)week$/.exec(time)[1];
since.push(t * 60 * 60 * 24 * 7);
}
if (time.match(/^\d+month$/)) {
// Time given in months
t = /^(\d+)month$/.exec(time)[1];
since.push(t * 60 * 60 * 24 * 30);
}
if (time.match(/^\d+year$/)) {
// Time given in years
t = /^(\d+)year$/.exec(time)[1];
since.push(t * 60 * 60 * 24 * 365);
}
}
break;
case 'tag': tag.push(term.slice(4)); break;
case 'text': text.push(term.slice(5)); break;
case 'uri': uri.push(term.slice(4)); break;
case 'user': user.push(term.slice(5)); break;
default: any.push(term);
}
}
}
return {
any: {
terms: any,
operator: 'and',
},
quote: {
terms: quote,
operator: 'and',
},
result: {
terms: result,
operator: 'min',
},
since: {
terms: since,
operator: 'and',
},
tag: {
terms: tag,
operator: 'and',
},
text: {
terms: text,
operator: 'and',
},
uri: {
terms: uri,
operator: 'or',
},
user: {
terms: user,
operator: 'or',
},
};
}
// @ngInject
function searchFilter() {
return {
toObject,
generateFacetedFilter,
};
}
module.exports = searchFilter;
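For reference, a minimal usage sketch of the converted module (the './search-filter' require path is assumed for illustration; the tests below use '../search-filter'). The output shapes follow from the code above:

var searchFilter = require('./search-filter')();  // path assumed for illustration

searchFilter.toObject('user:john tag:"foo bar" hi');
// -> {user: ['john'], tags: ['foo bar'], any: ['hi']}

searchFilter.generateFacetedFilter('since:7min').since;
// -> {terms: [420], operator: 'and'}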
{module, inject} = angular.mock
describe 'searchFilter', ->
sandbox = null
searchFilter = null
before ->
angular.module('h', [])
.service('searchFilter', require('../search-filter'))
beforeEach module('h')
beforeEach ->
sandbox = sinon.sandbox.create()
beforeEach inject (_searchFilter_) ->
searchFilter = _searchFilter_
afterEach ->
sandbox.restore()
describe 'toObject', ->
it 'puts a simple search string under the any filter', ->
query = 'foo'
result = searchFilter.toObject(query)
assert.equal(result.any[0], query)
it 'uses the filters as keys in the result object', ->
query = 'user:john text:foo quote:bar group:agroup other'
result = searchFilter.toObject(query)
assert.equal(result.any[0], 'other')
assert.equal(result.user[0], 'john')
assert.equal(result.text[0], 'foo')
assert.equal(result.quote[0], 'bar')
assert.equal(result.group[0], 'agroup')
it 'collects the same filters into a list', ->
query = 'user:john text:foo quote:bar other user:doe text:fuu text:fii'
result = searchFilter.toObject(query)
assert.equal(result.any[0], 'other')
assert.equal(result.user[0], 'john')
assert.equal(result.user[1], 'doe')
assert.equal(result.text[0], 'foo')
assert.equal(result.text[1], 'fuu')
assert.equal(result.text[2], 'fii')
assert.equal(result.quote[0], 'bar')
it 'preserves data with colon characters', ->
query = 'uri:http://test.uri'
result = searchFilter.toObject(query)
assert.equal(result.uri[0], 'http://test.uri')
it 'collects valid filters and puts invalid into the any category', ->
query = 'uri:test foo:bar text:hey john:doe quote:according hi-fi a:bc'
result = searchFilter.toObject(query)
assert.isFalse(result.foo?)
assert.isFalse(result.john?)
assert.isFalse(result.a?)
assert.equal(result.uri[0], 'test')
assert.equal(result.text[0], 'hey')
assert.equal(result.quote[0], 'according')
assert.equal(result.any[0], 'foo:bar')
assert.equal(result.any[1], 'john:doe')
assert.equal(result.any[2], 'hi-fi')
assert.equal(result.any[3], 'a:bc')
'use strict';
var searchFilter = require('../search-filter')();
describe('sidebar.search-filter', () => {
describe('#toObject', () => {
it('puts a simple search string under the any filter', () => {
var query = 'foo';
var result = searchFilter.toObject(query);
assert.equal(result.any[0], query);
});
it('uses the filters as keys in the result object', () => {
var query = 'user:john text:foo quote:bar group:agroup other';
var result = searchFilter.toObject(query);
assert.equal(result.any[0], 'other');
assert.equal(result.user[0], 'john');
assert.equal(result.text[0], 'foo');
assert.equal(result.quote[0], 'bar');
assert.equal(result.group[0], 'agroup');
});
it('collects the same filters into a list', () => {
var query = 'user:john text:foo quote:bar other user:doe text:fuu text:fii';
var result = searchFilter.toObject(query);
assert.equal(result.any[0], 'other');
assert.equal(result.user[0], 'john');
assert.equal(result.user[1], 'doe');
assert.equal(result.text[0], 'foo');
assert.equal(result.text[1], 'fuu');
assert.equal(result.text[2], 'fii');
assert.equal(result.quote[0], 'bar');
});
it('preserves data with colon characters', () => {
var query = 'uri:http://test.uri';
var result = searchFilter.toObject(query);
assert.equal(result.uri[0], 'http://test.uri');
});
it('collects valid filters and puts invalid into the "any" category', () => {
var query = 'uri:test foo:bar text:hey john:doe quote:according hi-fi a:bc';
var result = searchFilter.toObject(query);
assert.isUndefined(result.foo);
assert.isUndefined(result.john);
assert.isUndefined(result.a);
assert.equal(result.uri[0], 'test');
assert.equal(result.text[0], 'hey');
assert.equal(result.quote[0], 'according');
assert.equal(result.any[0], 'foo:bar');
assert.equal(result.any[1], 'john:doe');
assert.equal(result.any[2], 'hi-fi');
assert.equal(result.any[3], 'a:bc');
});
});
describe('#generateFacetedFilter', () => {
[{
query: 'one two three',
expectedFilter: {
any: {
operator: 'and',
terms: ['one', 'two', 'three'],
},
},
},{
query: 'tag:foo tag:bar',
expectedFilter: {
tag: {
operator: 'and',
terms: ['foo', 'bar'],
},
},
},{
query: 'quote:inthequote text:inthetext',
expectedFilter: {
quote: {
operator: 'and',
terms: ['inthequote'],
},
text: {
operator: 'and',
terms: ['inthetext'],
},
},
},{
query: 'user:john user:james',
expectedFilter: {
user: {
operator: 'or',
terms: ['john', 'james'],
},
},
},{
query: 'uri:https://example.org/article.html',
expectedFilter: {
uri: {
operator: 'or',
terms: ['https://example.org/article.html'],
},
},
}].forEach(({ query, expectedFilter }) => {
it('parses a search query', () => {
var filter = searchFilter.generateFacetedFilter(query);
// Remove empty facets.
Object.keys(filter).forEach((k) => {
if (filter[k].terms.length === 0) {
delete filter[k];
}
});
assert.deepEqual(filter, expectedFilter);
});
});
[{
timeExpr: '8sec',
expectedSecs: 8,
},{
timeExpr: '7min',
expectedSecs: 420,
},{
timeExpr: '7hour',
expectedSecs: 7 * 60 * 60,
},{
timeExpr: '4day',
expectedSecs: 4 * 60 * 60 * 24,
},{
timeExpr: '1week',
expectedSecs: 1 * 60 * 60 * 24 * 7,
},{
timeExpr: '2month',
expectedSecs: 2 * 60 * 60 * 24 * 30,
},{
timeExpr: '2year',
expectedSecs: 2 * 60 * 60 * 24 * 365,
},{
timeExpr: '5wibble',
expectedSecs: null,
}].forEach(({ timeExpr, expectedSecs }) => {
it('parses a "since:" query', () => {
var query = `since:${timeExpr}`;
var filter = searchFilter.generateFacetedFilter(query);
if (expectedSecs === null) {
assert.deepEqual(filter.since.terms, []);
} else {
assert.deepEqual(filter.since.terms, [expectedSecs]);
}
});
});
});
});