Commit 73b1652d authored by Robert Knight's avatar Robert Knight

Make search client paging more robust

There is currently an issue in h where the search API call may sometimes return
pages that are incomplete, even if there are more pages of results to fetch
afterwards [1]. A non-full page caused the client to stop fetching more pages as
it incorrectly assumed it had reached the end. Ultimately this needs to be
resolved in the API [2] by adding an explicit next-page link. Until that is done,
change the client to continue paging through results as long as the current page
has at least one entry (needed to construct the cursor) and we expect more
results based on the `total` value in the response.

Fixes https://github.com/hypothesis/client/issues/5219

[1] https://github.com/hypothesis/h/issues/7840
[2] https://github.com/hypothesis/h/issues/7841
parent c12eb7dd
...@@ -82,8 +82,16 @@ export class SearchClient extends TinyEmitter { ...@@ -82,8 +82,16 @@ export class SearchClient extends TinyEmitter {
private _getPageSize: (pageIndex: number) => number; private _getPageSize: (pageIndex: number) => number;
private _incremental: boolean; private _incremental: boolean;
private _maxResults: number | null; private _maxResults: number | null;
private _resultCount: null | number;
/** Total number of results we expect to receive, from the `total` field. */
private _expectedCount: null | number;
/** Total number of results we have received. */
private _fetchedCount: number;
/** Results received so far, if we are not emitting results incrementally. */
private _results: Annotation[]; private _results: Annotation[];
private _searchFn: (query: SearchQuery) => Promise<SearchResponse>; private _searchFn: (query: SearchQuery) => Promise<SearchResponse>;
private _separateReplies: boolean; private _separateReplies: boolean;
private _sortBy: SortBy; private _sortBy: SortBy;
...@@ -113,8 +121,9 @@ export class SearchClient extends TinyEmitter { ...@@ -113,8 +121,9 @@ export class SearchClient extends TinyEmitter {
this._sortOrder = sortOrder; this._sortOrder = sortOrder;
this._canceled = false; this._canceled = false;
this._expectedCount = null;
this._fetchedCount = 0;
this._results = []; this._results = [];
this._resultCount = null;
} }
/** /**
...@@ -147,10 +156,10 @@ export class SearchClient extends TinyEmitter { ...@@ -147,10 +156,10 @@ export class SearchClient extends TinyEmitter {
return; return;
} }
if (this._resultCount === null) { if (this._expectedCount === null) {
// Emit the result count (total) on first encountering it // Emit the result count (total) on first encountering it
this._resultCount = results.total; this._expectedCount = results.total;
this.emit('resultCount', this._resultCount); this.emit('resultCount', this._expectedCount);
} }
// For now, abort loading of annotations if `maxResults` is set and the // For now, abort loading of annotations if `maxResults` is set and the
...@@ -176,16 +185,29 @@ export class SearchClient extends TinyEmitter { ...@@ -176,16 +185,29 @@ export class SearchClient extends TinyEmitter {
} else { } else {
this._results = this._results.concat(page); this._results = this._results.concat(page);
} }
this._fetchedCount += page.length;
// If the current page was full, there might be additional pages available. // Check if we're expecting more results. If the `total` value from the
const nextPageAvailable = page.length === pageSize; // API is "small" it will be exact. If large, it may be a lower bound, in
// which case we'll keep fetching until we get no more. Calculating this
// is just an optimization to reduce the number of search API calls and
// make the loading state disappear sooner.
//
// See also https://www.elastic.co/guide/en/elasticsearch/reference/7.17/search-your-data.html#track-total-hits.
const expectMore =
this._fetchedCount < this._expectedCount || this._expectedCount > 1000;
// Get the cursor for the start of the next page. This is the last // Get the cursor for the start of the next page.
// value for whatever field results are sorted by from the current page. //
const nextSearchAfter = // Ideally this would be part of the API response (see
page.length > 0 ? page[page.length - 1][this._sortBy] : null; // https://github.com/hypothesis/h/issues/7841), but since it isn't yet,
// we have to construct this ourselves.
let nextSearchAfter = null;
if (page.length > 0 && expectMore) {
nextSearchAfter = page[page.length - 1][this._sortBy];
}
if (nextPageAvailable && nextSearchAfter) { if (nextSearchAfter) {
this._getPage(query, nextSearchAfter, pageIndex + 1); this._getPage(query, nextSearchAfter, pageIndex + 1);
} else { } else {
if (!this._incremental) { if (!this._incremental) {
...@@ -213,8 +235,9 @@ export class SearchClient extends TinyEmitter { ...@@ -213,8 +235,9 @@ export class SearchClient extends TinyEmitter {
* Emits an 'end' event once the search completes. * Emits an 'end' event once the search completes.
*/ */
get(query: SearchQuery) { get(query: SearchQuery) {
this._expectedCount = null;
this._fetchedCount = 0;
this._results = []; this._results = [];
this._resultCount = null;
this._getPage(query); this._getPage(query);
} }
......
...@@ -14,6 +14,7 @@ const RESULTS = [ ...@@ -14,6 +14,7 @@ const RESULTS = [
{ id: 'two', created: '2020-01-02', updated: '2020-02-03' }, { id: 'two', created: '2020-01-02', updated: '2020-02-03' },
{ id: 'three', created: '2020-01-03', updated: '2020-02-02' }, { id: 'three', created: '2020-01-03', updated: '2020-02-02' },
{ id: 'four', created: '2020-01-04', updated: '2020-02-01' }, { id: 'four', created: '2020-01-04', updated: '2020-02-01' },
{ id: 'five', created: '2020-01-05', updated: '2020-01-31' },
]; ];
/** /**
...@@ -264,12 +265,12 @@ describe('SearchClient', () => { ...@@ -264,12 +265,12 @@ describe('SearchClient', () => {
{ {
sortBy: 'updated', sortBy: 'updated',
sortOrder: 'asc', sortOrder: 'asc',
expectedSearchAfter: [undefined, '2020-02-02', '2020-02-04'], expectedSearchAfter: [undefined, '2020-02-01', '2020-02-03'],
}, },
{ {
sortBy: 'created', sortBy: 'created',
sortOrder: 'desc', sortOrder: 'desc',
expectedSearchAfter: [undefined, '2020-01-03', '2020-01-01'], expectedSearchAfter: [undefined, '2020-01-04', '2020-01-02'],
}, },
].forEach(({ sortBy, sortOrder, expectedSearchAfter }) => { ].forEach(({ sortBy, sortOrder, expectedSearchAfter }) => {
it('sets correct "search_after" query parameter depending on `sortBy` and `sortOrder`', async () => { it('sets correct "search_after" query parameter depending on `sortBy` and `sortOrder`', async () => {
...@@ -305,4 +306,40 @@ describe('SearchClient', () => { ...@@ -305,4 +306,40 @@ describe('SearchClient', () => {
const pageIndexes = getPageSize.getCalls().map(call => call.args[0]); const pageIndexes = getPageSize.getCalls().map(call => call.args[0]);
assert.deepEqual(pageIndexes, [0, 1, 2]); assert.deepEqual(pageIndexes, [0, 1, 2]);
}); });
// See https://github.com/hypothesis/client/issues/5219. Once
// https://github.com/hypothesis/h/issues/7841 is completed this test and the
// next one will become obsolete.
it('fetches remaining pages if a page contains fewer annotations than requested', async () => {
// Wrap search function to return fewer results in a page than requested.
const fetchPage = args => fakeSearchFn({ ...args, limit: 1 });
const client = new SearchClient(fetchPage, {
getPageSize: () => 2,
});
const onResults = sinon.stub();
client.on('results', onResults);
client.get({ uri: 'http://example.com' });
await awaitEvent(client, 'end');
for (let i = 0; i < RESULTS.length; i++) {
assert.calledWith(onResults, RESULTS.slice(i, i + 1));
}
});
it('continues fetching until result page is empty if expected result count is large', async () => {
const fetchPage = sinon.spy(async args => {
const result = await fakeSearchFn(args);
result.total = 3000;
return result;
});
const client = new SearchClient(fetchPage, {
getPageSize: () => RESULTS.length,
});
client.get({ uri: 'http://example.com' });
await awaitEvent(client, 'end');
assert.calledTwice(fetchPage);
});
}); });
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment