A custom metadata vocabulary for documenting low-resource language audio, video, and text datasets in AI/ML contexts, with specialized terms for PII screening, sensitive content handling, participant demographics, and quality metrics.
The Development Data Partnership Vocabulary (DDPV) defines specialized metadata terms that extend standard schemas for documenting datasets in low-resource language AI libraries. This vocabulary was developed to support the Gates Foundation-funded initiative to democratize access to high-quality datasets for low-resource language AI model training.
@prefix ddpv: <https://datapartnership.org/ddvp-metadata-terms#> .
Version: 1.0
Date Published: January 15, 2025
License: CC BY 4.0
Status: Testing
DDPV provides terms for:
DDPV extends and works alongside standards such as:
URI: https://datapartnership.org/ddvp-metadata-terms#piiScreening
{
"ddpv:piiScreening": true
}
URI: https://datapartnership.org/ddvp-metadata-terms#piiScreeningMethod
{
"ddpv:piiScreeningMethod": [
"automated_ner",
"manual_review"
]
}
URI: https://datapartnership.org/ddvp-metadata-terms#piiScreeningMethodOther
{
"ddpv:piiScreeningMethod": ["other"],
"ddpv:piiScreeningMethodOther":
"Custom facial recognition masking"
}
URI: https://datapartnership.org/ddvp-metadata-terms#piiNotes
{
"ddpv:piiNotes":
"Participant names mentioned in recordings"
}
URI: https://datapartnership.org/ddvp-metadata-terms#sensitiveContent
{
"ddpv:sensitiveContent": true
}
URI: https://datapartnership.org/ddvp-metadata-terms#sensitiveNotes
{
"ddpv:sensitiveNotes": [
"politically sensitive content",
"religious themes"
]
}
URI: https://datapartnership.org/ddvp-metadata-terms#attribution
{
"ddpv:attribution": true
}
URI: https://datapartnership.org/ddvp-metadata-terms#thirdPartyRestrictions
{
"ddpv:thirdPartyRestrictions":
"Music tracks subject to copyright"
}
URI: https://datapartnership.org/ddvp-metadata-terms#retentionPolicy
{
"ddpv:retentionPolicy":
"Data retained for 5 years"
}
URI: https://datapartnership.org/ddvp-metadata-terms#participants
{
"ddpv:participants": [
{
"olac:role": "speaker",
"olac:code": "SPK001",
"sc:gender": "female",
"ddpv:ageRange": "26-35",
"ddpv:dialectRegion": "Northern"
}
]
}
URI: https://datapartnership.org/ddvp-metadata-terms#ageRange
{
"ddpv:ageRange": "26-35"
}
URI: https://datapartnership.org/ddvp-metadata-terms#dialectRegion
{
"ddpv:dialectRegion": "Central"
}
URI: https://datapartnership.org/ddvp-metadata-terms#equipmentType
{
"ddpv:equipmentType":
"Zoom H6 with Sennheiser MKH 416"
}
URI: https://datapartnership.org/ddvp-metadata-terms#aggregationType
{
"@type": "dqv:QualityMeasurement",
"dqv:isMeasurementOf": "ebucore:duration",
"dqv:value": 145.3,
"schema:unitText": "seconds",
"ddpv:aggregationType": "mean"
}
URI: https://datapartnership.org/ddvp-metadata-terms#NumFiles
{
"@type": "dqv:QualityMeasurement",
"dqv:isMeasurementOf": "ddpv:NumFiles",
"dqv:value": 1250,
"schema:unitText": "count",
"ddpv:aggregationType": "count"
}
The following terms are specific to text datasets and provide metadata for documenting text files and corpus-level quality metrics.
URI: https://datapartnership.org/ddvp-metadata-terms#charCount
{
"ddpv:charCount": 125847
}
URI: https://datapartnership.org/ddvp-metadata-terms#tokenCount
{
"ddpv:tokenCount": 24563
}
URI: https://datapartnership.org/ddvp-metadata-terms#qualityMetrics
{
"ddpv:qualityMetrics": {
"perplexityScore": 142.3
}
}
These terms are used as values for dqv:isMeasurementOf in dataset-level quality measurements.
URI: https://datapartnership.org/ddvp-metadata-terms#totalTokens
{
"@type": "dqv:QualityMeasurement",
"dqv:isMeasurementOf": "ddpv:totalTokens",
"dqv:value": 12584320,
"schema:unitText": "tokens",
"ddpv:aggregationType": "sum"
}
URI: https://datapartnership.org/ddvp-metadata-terms#vocabularySize
{
"@type": "dqv:QualityMeasurement",
"dqv:isMeasurementOf": "ddpv:vocabularySize",
"dqv:value": 98540,
"schema:unitText": "unique tokens",
"ddpv:aggregationType": "count"
}
URI: https://datapartnership.org/ddvp-metadata-terms#avgCharsPerDoc
{
"@type": "dqv:QualityMeasurement",
"dqv:isMeasurementOf": "ddpv:avgCharsPerDoc",
"dqv:value": 3542.8,
"schema:unitText": "characters",
"ddpv:aggregationType": "mean"
}
URI: https://datapartnership.org/ddvp-metadata-terms#avgTokensPerDoc
{
"@type": "dqv:QualityMeasurement",
"dqv:isMeasurementOf": "ddpv:avgTokensPerDoc",
"dqv:value": 1245.6,
"schema:unitText": "tokens",
"ddpv:aggregationType": "mean"
}
URI: https://datapartnership.org/ddvp-metadata-terms#NER_Coverage
{
"@type": "dqv:QualityMeasurement",
"dqv:isMeasurementOf": "ddpv:NER_Coverage",
"dqv:value": 78.5,
"schema:unitText": "percentage",
"ddpv:aggregationType": "percentage"
}