Skip to content

Commit 801d292

Browse files
committed
feat: add place extraction
- still misses a lot of places
1 parent 40526a8 commit 801d292

File tree

3 files changed

+35
-5
lines changed

3 files changed

+35
-5
lines changed

package-lock.json

Lines changed: 6 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "intel-api-server",
3-
"version": "0.4.4",
3+
"version": "0.5.0",
44
"description": "An implementation of the Dabbu Intel API that helps in extraction of and one pagers/summaries of all knowledge around a certain topic, person or place based on information/data from multiple providers",
55
"main": "src/server.js",
66
"scripts": {
@@ -23,11 +23,12 @@
2323
},
2424
"homepage": "https://github.com/dabbu-knowledge-platform/intel-api-server#readme",
2525
"dependencies": {
26+
"country-in-text-detector": "^1.0.10",
2627
"express": "^4.17.1",
2728
"fs-extra": "^9.1.0",
29+
"lda": "0.2.0",
2830
"multer": "^1.4.2",
29-
"office-text-extractor": "^1.0.2",
30-
"lda": "0.2.0"
31+
"office-text-extractor": "^1.0.2"
3132
},
3233
"devDependencies": {
3334
"pkg": "^4.4.9",

src/routes/extract_info.js

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@ const multer = require('multer')
2626
const extractText = require('office-text-extractor')
2727
// The library used to extract keywords
2828
const lda = require('lda')
29-
3029
// Helper function to return keywords in an object format
3130
const extractTopics = (sentences) => {
3231
// Run the LDA library on it (10 clusters of five terms each)
@@ -60,6 +59,8 @@ const extractTopics = (sentences) => {
6059
// Return the final array of keywords
6160
return finalResult
6261
}
62+
// The library used to detect country and place names mentioned in text
63+
const detectPlace = require('country-in-text-detector').detect
6364

6465
// Files library, used to do all file operations across platforms
6566
const fs = require('fs-extra')
@@ -172,6 +173,27 @@ async function extractEmails(name, text) {
172173
return matches
173174
}
174175

176+
// Extract places from text
177+
// FLAW: Does NOT work for all places
178+
/**
 * Extract the places mentioned in a file's text.
 *
 * Runs `detectPlace` over the text and reshapes every match into a plain
 * object tagged with the name of the file it was found in.
 *
 * @param {string} name - Name of the file the text was extracted from
 * @param {string} text - The text to scan for places
 * @returns {Promise<Array<{place: *, type: *, country: *, file: string}>>}
 *   One entry per match; an empty array if the text is empty or missing
 */
async function extractPlaces(name, text) {
  // Return an empty array (not null) when there is no text: the caller
  // spreads this result into `results.places.push(...)`, and spreading
  // null throws a TypeError
  if (!text) return []

  // Convert each match to an object and add the file name
  return detectPlace(text).map((match) => ({
    place: match.name,
    type: match.type,
    country: match.iso3166,
    file: name,
  }))
}
196+
175197
// Put together the above functions and run them for each file
176198
async function processFiles(files) {
177199
// Check if there are any files
@@ -180,6 +202,7 @@ async function processFiles(files) {
180202
let results = {
181203
topics: [],
182204
people: [],
205+
places: [],
183206
}
184207
for (const file of files) {
185208
// Extract the text from the file
@@ -189,6 +212,7 @@ async function processFiles(files) {
189212
...(await extractCommonWords(file.name, text))
190213
)
191214
results.people.push(...(await extractEmails(file.name, text)))
215+
results.places.push(...(await extractPlaces(file.name, text)))
192216
}
193217

194218
// Return successfully

0 commit comments

Comments
 (0)