Skip to content

Commit c3ce64c

Browse files
authored
Merge pull request #214 from pelias/avoid-transliterated-names
skip names we suspect were sourced from machine transliteration
2 parents 5430492 + 624a397 commit c3ce64c

File tree

1 file changed

+21
-6
lines changed

1 file changed

+21
-6
lines changed

prototype/wof.js

+21-6
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ const util = require('util');
66
const blacklist = require('pelias-blacklist-stream/loader')();
77
const analysis = require('../lib/analysis');
88
const language = dir('../config/language');
9+
const LOW_POPULATION_THRESHOLD = 2000;
910

1011
// list of languages / tags we favour in cases of deduplication
1112
const LANG_PREFS = ['eng','und'];
@@ -97,12 +98,25 @@ function insertWofRecord( wof, next ){
9798
}
9899
}
99100

100-
// add 'name:*'
101-
for( var attr in wof ){
102-
// https://github.com/whosonfirst/whosonfirst-names
103-
// names: preferred|colloquial|variant|unknown
104-
var match = attr.match(/^name:([a-z]{3})_x_(preferred|colloquial|variant)$/);
105-
if( match ){
101+
// note: skip all `name:*` fields when we suspect that they were sourced from
102+
// machine transliteration via Wikidata.
103+
// see: https://github.com/whosonfirst-data/whosonfirst-data/issues/799
104+
const hasDeadOrObscureLanguages = _.has(wof, 'name:vol_x_preferred');
105+
const isLowOrUnknownPopulation = _.get(doc, 'population', 0) < LOW_POPULATION_THRESHOLD;
106+
const isMegaCity = _.get(doc, 'wof:megacity', 0) === 1;
107+
const isCapitalCity = !_.isEmpty(_.get(doc, 'wof:capital_of'));
108+
const isLikelyTransliterated = (
109+
hasDeadOrObscureLanguages && isLowOrUnknownPopulation && !isMegaCity && !isCapitalCity
110+
);
111+
if (!isLikelyTransliterated) {
112+
113+
// add 'name:*' fields
114+
for( var attr in wof ){
115+
// https://github.com/whosonfirst/whosonfirst-names
116+
// names: preferred|colloquial|variant|unknown
117+
const match = attr.match(/^name:([a-z]{3})_x_(preferred|colloquial|variant)$/);
118+
if (!match) { continue; }
119+
106120
// Fix for https://github.com/pelias/placeholder/pull/126
107121
// Transform ISO 639-2/B (bibliographic) language codes to ISO 639-2/T (terminologic)
108122
const lang = language.alternatives[match[1]] || match[1];
@@ -127,6 +141,7 @@ function insertWofRecord( wof, next ){
127141
doc.names[ lang ] = wof[ attr ];
128142
}
129143
}
144+
130145
}
131146
}
132147

0 commit comments

Comments
 (0)