build(docs-infra): improve search quality (#25750)

PR Close #25750
This commit is contained in:
Pete Bacon Darwin 2018-09-14 14:08:48 +01:00 committed by Kara Erickson
parent ee9f0b5d9a
commit ec96332559
5 changed files with 41 additions and 27 deletions

View File

@ -27,12 +27,13 @@ self.onmessage = handleMessage;
// Create the lunr index - the docs should be an array of objects, each object containing // Create the lunr index - the docs should be an array of objects, each object containing
// the path and search terms for a page // the path and search terms for a page
/**
 * Create the lunr index.
 *
 * Configures lunr to split both indexed text and query strings on whitespace only, then
 * builds an index keyed by `path` with four boosted fields.
 *
 * @param {function(Object): void} addFn Callback that receives the lunr builder (the `this`
 *     of the configuration function) after the fields are declared; it is expected to add
 *     the documents to it.
 * @returns {*} Whatever `lunr()` returns for the configured builder.
 */
function createIndex(addFn) {
  // The same RegExp instance is shared by the query lexer and the tokenizer so that queries
  // are split exactly like the indexed text.
  lunr.QueryLexer.termSeparator = lunr.tokenizer.separator = /\s+/;
  return lunr(/** @this */function() {
    this.ref('path');
    // Relative weights: a hit in the title outranks headings, members and plain keywords.
    this.field('titleWords', {boost: 10});
    this.field('headingWords', {boost: 5});
    this.field('members', {boost: 4});
    this.field('keywords', {boost: 2});
    addFn(this);
  });
}
@ -86,10 +87,13 @@ function loadIndex(searchInfo /*: SearchInfo */) {
function queryIndex(query) { function queryIndex(query) {
try { try {
if (query.length) { if (query.length) {
var results = index.search(query);
if (results.length === 0) {
// Add a relaxed search in the title for the first word in the query // Add a relaxed search in the title for the first word in the query
// E.g. if the search is "ngCont guide" then we search for "ngCont guide titleWords:ngCont*" // E.g. if the search is "ngCont guide" then we search for "ngCont guide titleWords:ngCont*"
var titleQuery = 'titleWords:*' + query.split(' ', 1)[0] + '*'; var titleQuery = 'titleWords:*' + query.split(' ', 1)[0] + '*';
var results = index.search(query + ' ' + titleQuery); results = index.search(query + ' ' + titleQuery);
}
// Map the hits into info about each page to be returned as results // Map the hits into info about each page to be returned as results
return results.map(function(hit) { return pages[hit.ref]; }); return results.map(function(hit) { return pages[hit.ref]; });
} }

View File

@ -12,7 +12,7 @@ describe('site search', () => {
page.enterSearch('ngCont'); page.enterSearch('ngCont');
expect(page.getSearchResults()).toContain('NgControl'); expect(page.getSearchResults()).toContain('NgControl');
page.enterSearch('accessor'); page.enterSearch('valueaccess');
expect(page.getSearchResults()).toContain('ControlValueAccessor'); expect(page.getSearchResults()).toContain('ControlValueAccessor');
}); });

View File

@ -31,9 +31,6 @@ module.exports = function generateKeywordsProcessor(log, readFilesProcessor) {
var propertiesToIgnore; var propertiesToIgnore;
var docTypesToIgnore; var docTypesToIgnore;
// Keywords start with "ng:" or one of $, _ or a letter
var KEYWORD_REGEX = /^((ng:|[$_a-z])[\w\-_]+)/;
// Load up the keywords to ignore, if specified in the config // Load up the keywords to ignore, if specified in the config
if (this.ignoreWordsFile) { if (this.ignoreWordsFile) {
var ignoreWordsPath = path.resolve(readFilesProcessor.basePath, this.ignoreWordsFile); var ignoreWordsPath = path.resolve(readFilesProcessor.basePath, this.ignoreWordsFile);
@ -52,20 +49,33 @@ module.exports = function generateKeywordsProcessor(log, readFilesProcessor) {
// If the heading contains a name starting with ng, e.g. "ngController", then add the // If the heading contains a name starting with ng, e.g. "ngController", then add the
// name without the ng to the text, e.g. "controller". // name without the ng to the text, e.g. "controller".
/**
 * Split a piece of text into lower-cased search tokens.
 *
 * The text is split on whitespace and `/`. Each raw token has leading and trailing
 * punctuation stripped and is kept only if what remains consists solely of word characters,
 * `.` and `-`. If a kept token starts with `ng`/`Ng` followed by an upper-case letter
 * (e.g. "ngController"), the part after the prefix is added as an extra token ("controller").
 *
 * @param {string} text The text to tokenize.
 * @returns {string[]} The lower-cased tokens in order of appearance (duplicates are kept).
 */
function tokenize(text) {
  const rawTokens = text.split(/[\s\/]+/);
  const tokens = [];
  rawTokens.forEach(token => {
    // Strip off unwanted trivial characters.
    // Trailing sentence punctuation (`,;:!?`) is stripped as well: otherwise a word that is
    // followed by e.g. a comma fails the check below and is dropped from the index entirely.
    token = token
        .trim()
        .replace(/^[_\-"'`({[<$*)}\]>.]+/, '')
        .replace(/[_\-"'`({[<$*)}\]>.,;:!?]+$/, '');
    // Ignore tokens that contain weird characters (this also discards empty tokens)
    if (/^[\w.\-]+$/.test(token)) {
      tokens.push(token.toLowerCase());
      // Also index the name without its `ng` prefix, e.g. "ngController" -> "controller"
      const ngTokenMatch = /^[nN]g([A-Z]\w*)/.exec(token);
      if (ngTokenMatch) {
        tokens.push(ngTokenMatch[1].toLowerCase());
      }
    }
  });
  return tokens;
}
/**
 * Append the not-yet-seen tokens of `text` to `words`, recording each one in `keywordMap`.
 *
 * @param {string} text The text to split into tokens (via `tokenize()`).
 * @param {string[]} words Array that new tokens are appended to (mutated in place).
 * @param {Object<string, boolean>} keywordMap Lookup of tokens that must not be added; a
 *     token with a truthy own entry is skipped, and every token that is added is marked
 *     `true` here (mutated in place).
 */
function extractWords(text, words, keywordMap) {
  tokenize(text).forEach(function(token) {
    // Only own properties count as "seen". On a plain object an inherited key such as
    // `constructor` is truthy as well, which would wrongly keep that word out of the index.
    var seen = Object.prototype.hasOwnProperty.call(keywordMap, token) && keywordMap[token];
    if (!seen) {
      words.push(token);
      keywordMap[token] = true;
    }
  });
}
@ -116,7 +126,7 @@ module.exports = function generateKeywordsProcessor(log, readFilesProcessor) {
// Attach all this search data to the document // Attach all this search data to the document
doc.searchTerms = { doc.searchTerms = {
titleWords: preprocessText(doc.searchTitle), titleWords: tokenize(doc.searchTitle).join(' '),
headingWords: headingWords.sort().join(' '), headingWords: headingWords.sort().join(' '),
keywords: words.sort().join(' '), keywords: words.sort().join(' '),
members: members.sort().join(' ') members: members.sort().join(' ')

View File

@ -80,7 +80,7 @@ describe('generateKeywords processor', () => {
]; ];
processor.$process(docs); processor.$process(docs);
const keywordsDoc = docs[docs.length - 1]; const keywordsDoc = docs[docs.length - 1];
expect(keywordsDoc.data[0].titleWords).toEqual('class PublicExport'); expect(keywordsDoc.data[0].titleWords).toEqual('class publicexport');
}); });
it('should add heading words to the search terms', () => { it('should add heading words to the search terms', () => {
@ -141,7 +141,7 @@ describe('generateKeywords processor', () => {
]; ];
processor.$process(docs); processor.$process(docs);
const keywordsDoc = docs[docs.length - 1]; const keywordsDoc = docs[docs.length - 1];
expect(keywordsDoc.data[0].titleWords).toEqual('ngController Controller'); expect(keywordsDoc.data[0].titleWords).toEqual('ngcontroller controller');
expect(keywordsDoc.data[0].headingWords).toEqual('model ngmodel'); expect(keywordsDoc.data[0].headingWords).toEqual('model ngmodel');
expect(keywordsDoc.data[0].keywords).toContain('class'); expect(keywordsDoc.data[0].keywords).toContain('class');
expect(keywordsDoc.data[0].keywords).toContain('ngclass'); expect(keywordsDoc.data[0].keywords).toContain('ngclass');
@ -163,7 +163,7 @@ describe('generateKeywords processor', () => {
[{ [{
'title':'SomeClass', 'title':'SomeClass',
'type':'class', 'type':'class',
'titleWords':'SomeClass', 'titleWords':'someclass',
'headingWords':'heading some someclass', 'headingWords':'heading some someclass',
'keywords':'api class documentation for is someclass the', 'keywords':'api class documentation for is someclass the',
'members':'' 'members':''

View File

@ -37,7 +37,7 @@ module.exports = new Package('angular.io', [gitPackage, apiPackage, contentPacka
checkAnchorLinksProcessor.$runBefore = ['convertToJsonProcessor']; checkAnchorLinksProcessor.$runBefore = ['convertToJsonProcessor'];
checkAnchorLinksProcessor.$runAfter = ['fixInternalDocumentLinks']; checkAnchorLinksProcessor.$runAfter = ['fixInternalDocumentLinks'];
// We only want to check docs that are going to be output as JSON docs. // We only want to check docs that are going to be output as JSON docs.
checkAnchorLinksProcessor.checkDoc = (doc) => doc.path && doc.outputPath && extname(doc.outputPath) === '.json'; checkAnchorLinksProcessor.checkDoc = (doc) => doc.path && doc.outputPath && extname(doc.outputPath) === '.json' && doc.docType !== 'json-doc';
// Since we have a `base[href="/"]` arrangement all links are relative to that and not relative to the source document's path // Since we have a `base[href="/"]` arrangement all links are relative to that and not relative to the source document's path
checkAnchorLinksProcessor.base = '/'; checkAnchorLinksProcessor.base = '/';
// Ignore links to local assets // Ignore links to local assets