Part 10: OCR
4 minute read
In this example, you will ingest the Nuix logo below into your case and perform Optical Character Recognition (OCR) on it. Right-click the image below in your browser and save it to disk as `nuix.png`.
Prerequisites
- Install the OCR plugin from https://download.nuix.com/releases/addons.
- Verify that the OCR plugin is installed by calling the `resources/ocr` endpoint.
curl --location --request GET 'http://localhost:8080/nuix-restful-service/svc/v1/resources/ocr' \
--header 'nuix-auth-token: 9729a460-eda7-48dc-ba70-d12b3aae3c8d'
{
"success": true
}
File Ingestion of Image
Ingest your image with text.
curl --location --request POST 'http://localhost:8080/nuix-restful-service/svc/v1/cases/43b070164ce8453ca30ed9e2dfcce67b/evidence/file' \
--header 'nuix-auth-token: 9729a460-eda7-48dc-ba70-d12b3aae3c8d' \
--header 'Content-Type: application/json' \
--header 'Accept: application/json' \
--data-raw '{
"processorSettings": {
"storeBinary": true
},
"target": {
"path": "/Images/nuix.png"
}
}
'
{
"functionKey": "f599c065-523e-4dd5-8ab1-82cad97afe8b",
"location": "http://localhost:8080/nuix-restful-service/svc/v1/asyncFunctions/f599c065-523e-4dd5-8ab1-82cad97afe8b"
}
File Ingestion Status
Use the `functionKey` field returned from the ingestion endpoint above to poll the `asyncFunctions` endpoint for status.
curl --location --request GET 'http://localhost:8080/nuix-restful-service/svc/v1/asyncFunctions/f599c065-523e-4dd5-8ab1-82cad97afe8b' \
--header 'nuix-auth-token: 9729a460-eda7-48dc-ba70-d12b3aae3c8d'
{
"done": true,
"cancelled": false,
"result": true,
"token": "a3b10cf2-1c75-4507-86dd-865eb56555b7",
"functionKey": "f599c065-523e-4dd5-8ab1-82cad97afe8b",
"progress": 7011,
"total": 0,
"percentComplete": null,
"updatedOn": 1613506664487,
"status": null,
"statusId": null,
"requestTime": 1613506653553,
"startTime": 1613506653553,
"finishTime": 1613506666353,
"caseId": "43b070164ce8453ca30ed9e2dfcce67b",
"caseName": "HelloWorld",
"hasSuccessfullyCompleted": true,
"friendlyName": "Evidence Ingestion Function",
"caseLocation": "/Cases/HelloWorld",
"requestor": "nuixadmin",
"action": "AsyncBulkIngestionFunction",
"options": {
"reloadQuery": null,
"processorSettings": {
"processText": null,
"processLooseFileContents": null,
"processForensicImages": null,
"analysisLanguage": null,
"stopWords": null,
"stemming": null,
"enableExactQueries": null,
"extractNamedEntities": null,
"extractNamedEntitiesFromText": null,
"extractNamedEntitiesFromProperties": null,
"extractNamedEntitiesFromTextStripped": null,
"extractShingles": null,
"processTextSummaries": null,
"calculateSSDeepFuzzyHash": null,
"detectFaces": null,
"extractFromSlackSpace": null,
"carveFileSystemUnallocatedSpace": null,
"carveUnidentifiedData": null,
"carvingBlockSize": null,
"recoverDeletedFiles": null,
"extractEndOfFileSlackSpace": null,
"smartProcessRegistry": null,
"identifyPhysicalFiles": null,
"createThumbnails": null,
"skinToneAnalysis": null,
"calculateAuditedSize": null,
"storeBinary": true,
"maxStoredBinarySize": null,
"maxDigestSize": null,
"digests": [],
"addBccToEmailDigests": null,
"addCommunicationDateToEmailDigests": null,
"reuseEvidenceStores": null,
"processFamilyFields": null,
"hideEmbeddedImmaterialData": null,
"reportProcessingStatus": null,
"workerItemCallback": null,
"workerItemCallbacks": null
},
"evidence": [
{
"guid": null,
"name": null,
"customMetadata": null,
"encoding": null,
"custodian": null,
"timeZone": null,
"description": null,
"locale": null,
"files": [
{
"path": "/Images/nuix.png"
}
],
"exchangeMailboxes": null,
"s3Buckets": null,
"sqlServers": null,
"enterpriseVaults": null,
"sharepointSites": null,
"mailStores": null,
"loadFiles": null,
"centeraClusters": null,
"splitFiles": null,
"dropboxes": null,
"sshServers": null
}
],
"localWorkerCount": 1,
"repositories": [],
"parallelProcessingSettings": {
"workerCount": null,
"workerMemory": null,
"workerTemp": null,
"brokerMemory": null,
"workerBrokerAddress": null,
"useRemoteWorkers": false,
"embedBroker": true
},
"rescanEvidenceRepositories": false,
"loadProcessingJob": {
"casePath": "/Cases/HelloWorld",
"jobGuid": "55480a21-720c-4f3e-baa6-c7c5b53b28b0",
"processingMode": "Load",
"startDate": 1613506654064,
"workerCount": 1,
"finished": true,
"paused": false,
"masterAddress": "1.1.1.1",
"bytesProcessed": 7011,
"itemsProcessed": 1,
"jobSizeTotalBytes": 0
}
},
"participatingInCaseFunctionQueue": true,
"processedBy": "nuix-restful-server-1",
"errorMsg": null
}
Item Verification
Verify that the item (`nuix.png`) is in your case by running a search. The search result also contains the item's `guid`, which you need for OCR.
curl --location --request GET 'http://localhost:8080/nuix-restful-service/svc/v2/cases/43b070164ce8453ca30ed9e2dfcce67b/search?query=nuix.png&metadataProfile=Default&numberOfRecordsRequested=1' \
--header 'nuix-auth-token: 9729a460-eda7-48dc-ba70-d12b3aae3c8d'
{
"request": {
"caseId": "43b070164ce8453ca30ed9e2dfcce67b",
"query": "nuix.png",
"sortField": null,
"sortOrder": null,
"startIndex": 0,
"numberOfRecordsRequested": 1,
"deduplicate": null,
"metadataProfile": "Default",
"fieldList": null,
"customMetadataList": null,
"propertyList": null,
"itemParameterizedFields": null,
"showAvailableThumbnails": false,
"useCache": false,
"forceCacheDelete": false,
"searchRetry": 0,
"relationType": null,
"entities": [],
"s": 0,
"p": 1,
"customMetadataField": null,
"field": null,
"property": null
},
"startedOn": 1613507186542,
"completedOn": 1613507186598,
"elapsedTimeForSearch": 50,
"elapsedTimeForSort": 0,
"elapsedTimeForMarshal": 1,
"elapsedTimeForDeduplicate": 0,
"elapsedTotal": 56,
"metadataItems": [
"Name",
"File Type",
"Path Name"
],
"localizedMetadataItems": [
"Name",
"File Type",
"Path Name"
],
"metadataItemDetails": [
{
"name": "Name",
"localisedName": "Name",
"type": "String"
},
{
"name": "File Type",
"localisedName": "File Type",
"type": "String"
},
{
"name": "Path Name",
"localisedName": "Path Name",
"type": "String"
}
],
"resultList": [
{
"File Type": "Portable Network Graphic",
"Name": "nuix.png",
"Path Name": "/e1746668-8eca-48b2-a8bd-14f17c59c1e1",
"guid": "519ca60c-a397-4d1b-a5aa-547287a8ad1a"
}
],
"count": 1,
"deduplicatedCount": 1
}
OCR
You can now OCR the item using the `guid` returned from your search query (`519ca60c-a397-4d1b-a5aa-547287a8ad1a`).
curl --location --request PUT 'http://localhost:8080/nuix-restful-service/svc/v1/cases/43b070164ce8453ca30ed9e2dfcce67b/items/ocr' \
--header 'nuix-auth-token: 9729a460-eda7-48dc-ba70-d12b3aae3c8d' \
--header 'Content-Type: application/json' \
--header 'Accept: application/json' \
--data-raw '{
"parallelProcessingSettings": {
"embedBroker": true,
"workerCount": 4,
"workerMemory": 2048
},
"query": "guid:519ca60c-a397-4d1b-a5aa-547287a8ad1a"
}'
{
"functionKey": "f534cd08-c374-4c7a-acf4-32ec613adbff",
"location": "http://localhost:8080/nuix-restful-service/svc/v1/asyncFunctions/f534cd08-c374-4c7a-acf4-32ec613adbff"
}
OCR Status
Use the `functionKey` field returned from the OCR endpoint above to poll the `asyncFunctions` endpoint for status.
curl --location --request GET 'http://localhost:8080/nuix-restful-service/svc/v1/asyncFunctions/f534cd08-c374-4c7a-acf4-32ec613adbff' \
--header 'nuix-auth-token: 9729a460-eda7-48dc-ba70-d12b3aae3c8d'
{
"done": true,
"cancelled": false,
"result": {
"total": 1,
"failure": 0
},
"token": "a3b10cf2-1c75-4507-86dd-865eb56555b7",
"functionKey": "f534cd08-c374-4c7a-acf4-32ec613adbff",
"progress": 1,
"total": 1,
"percentComplete": 100.0000,
"updatedOn": 1613507447309,
"status": null,
"statusId": null,
"requestTime": 1613507437697,
"startTime": 1613507437698,
"finishTime": 1613507449524,
"caseId": "43b070164ce8453ca30ed9e2dfcce67b",
"caseName": "HelloWorld",
"hasSuccessfullyCompleted": true,
"friendlyName": "OCR Function",
"caseLocation": "/Cases/HelloWorld",
"requestor": "nuixadmin",
"action": "AsyncOcrFunction",
"options": {
"ocrOptions": {
"regeneratePdfs": null,
"updatePdf": null,
"updateText": null,
"textModification": null,
"quality": null,
"rotation": null,
"deskew": null,
"clearOcrCache": null,
"outputDirectory": null,
"languages": [
"ENGLISH"
],
"timeout": null,
"updateDuplicates": null,
"ocrProfileName": null
},
"localWorkerCount": 1,
"parallelProcessingSettings": {
"workerCount": 4,
"workerMemory": 2048,
"workerTemp": null,
"brokerMemory": null,
"workerBrokerAddress": null,
"useRemoteWorkers": false,
"embedBroker": true
},
"query": "guid:519ca60c-a397-4d1b-a5aa-547287a8ad1a",
"ocrProfile": null,
"ocrImagingSettings": null,
"exportProcessingJob": {
"casePath": "/Cases/HelloWorld",
"jobGuid": "145e16b9-7800-4240-b108-a2e8ecc58724",
"processingMode": "Export",
"startDate": 1613507437889,
"workerCount": 9,
"finished": true,
"paused": false,
"masterAddress": "1.1.1.1",
"currentStage": "TEXT_REPLACEMENT",
"currentStageDuration": "0 seconds",
"currentStageExportedItemsCount": 1,
"failedItemCount": 0,
"totalItemCount": 1
},
"imagingProfile": null,
"tags": null
},
"participatingInCaseFunctionQueue": true,
"processedBy": "nuix-restful-server-1",
"errorMsg": null
}
Item Text
Finally, you can retrieve the item text of the image using the `itemText` endpoint.
curl --location --request GET 'http://localhost:8080/nuix-restful-service/svc/v1/cases/43b070164ce8453ca30ed9e2dfcce67b/items/519ca60c-a397-4d1b-a5aa-547287a8ad1a/itemText' \
--header 'nuix-auth-token: 9729a460-eda7-48dc-ba70-d12b3aae3c8d'
{
"text": "nuix",
"binaryAvailable": true,
"htmlEscape": false,
"totalTextLength": 4,
"blank": false
}
The `text` field in the response is `nuix`, which confirms that OCR was performed successfully on this image.
Feedback
Was this page helpful?
Thank you for your feedback.
Thank you for your feedback.