diff --git a/data/countries.ttl b/data/countries.ttl new file mode 100644 index 0000000..1219d38 --- /dev/null +++ b/data/countries.ttl @@ -0,0 +1,250 @@ +@prefix country: . +@prefix rdfs: . +@prefix schema: . + +country:Algeria a schema:Country ; + rdfs:label "Algeria"@en . + +country:Argentina a schema:Country ; + rdfs:label "Argentina"@en . + +country:Australia a schema:Country ; + rdfs:label "Australia"@en . + +country:Austria a schema:Country ; + rdfs:label "Austria"@en . + +country:Azerbaijan a schema:Country ; + rdfs:label "Azerbaijan"@en . + +country:Belarus a schema:Country ; + rdfs:label "Belarus"@en . + +country:Belgium a schema:Country ; + rdfs:label "Belgium"@en . + +country:Brazil a schema:Country ; + rdfs:label "Brazil"@en . + +country:Bulgaria a schema:Country ; + rdfs:label "Bulgaria"@en . + +country:Canada a schema:Country ; + rdfs:label "Canada"@en . + +country:Channel_Islands a schema:Country ; + rdfs:label "Channel Islands"@en . + +country:Chile a schema:Country ; + rdfs:label "Chile"@en . + +country:China a schema:Country ; + rdfs:label "China"@en . + +country:Columbia a schema:Country ; + rdfs:label "Columbia"@en . + +country:Costa_Rica a schema:Country ; + rdfs:label "Costa Rica"@en . + +country:Croatia a schema:Country ; + rdfs:label "Croatia"@en . + +country:Cuba a schema:Country ; + rdfs:label "Cuba"@en . + +country:Czech_Republic a schema:Country ; + rdfs:label "Czech Republic"@en . + +country:Denmark a schema:Country ; + rdfs:label "Denmark"@en . + +country:Eastern_Europe a schema:Country ; + rdfs:label "Eastern Europe"@en . + +country:Egypt a schema:Country ; + rdfs:label "Egypt"@en . + +country:El_Salvador a schema:Country ; + rdfs:label "El Salvador"@en . + +country:England a schema:Country ; + rdfs:label "England"@en . + +country:England_UK a schema:Country ; + rdfs:label "England (UK)"@en . + +country:Estland a schema:Country ; + rdfs:label "Estland"@en . + +country:Europe a schema:Country ; + rdfs:label "Europe"@en . 
+ +country:Fiji a schema:Country ; + rdfs:label "Fiji"@en . + +country:France a schema:Country ; + rdfs:label "France"@en . + +country:Germany a schema:Country ; + rdfs:label "Germany"@en . + +country:Greece a schema:Country ; + rdfs:label "Greece"@en . + +country:Hungary a schema:Country ; + rdfs:label "Hungary"@en . + +country:India a schema:Country ; + rdfs:label "India"@en . + +country:Indonesia a schema:Country ; + rdfs:label "Indonesia"@en . + +country:Ireland a schema:Country ; + rdfs:label "Ireland"@en . + +country:Israel a schema:Country ; + rdfs:label "Israel"@en . + +country:Italy a schema:Country ; + rdfs:label "Italy"@en . + +country:Japan a schema:Country ; + rdfs:label "Japan"@en . + +country:Korea a schema:Country ; + rdfs:label "Korea"@en . + +country:Latvia a schema:Country ; + rdfs:label "Latvia"@en . + +country:Lithuania a schema:Country ; + rdfs:label "Lithuania"@en . + +country:Mexico a schema:Country ; + rdfs:label "Mexico"@en . + +country:Moldova a schema:Country ; + rdfs:label "Moldova"@en . + +country:Monaco a schema:Country ; + rdfs:label "Monaco"@en . + +country:Myanmar a schema:Country ; + rdfs:label "Myanmar"@en . + +country:Netherlands a schema:Country ; + rdfs:label "Netherlands"@en . + +country:New_Zealand a schema:Country ; + rdfs:label "New Zealand"@en . + +country:North_America a schema:Country ; + rdfs:label "North America"@en . + +country:Northern_Ireland_UK a schema:Country ; + rdfs:label "Northern Ireland (UK)"@en . + +country:Norway a schema:Country ; + rdfs:label "Norway"@en . + +country:Palestine a schema:Country ; + rdfs:label "Palestine"@en . + +country:Panama a schema:Country ; + rdfs:label "Panama"@en . + +country:Paraguay a schema:Country ; + rdfs:label "Paraguay"@en . + +country:Peru a schema:Country ; + rdfs:label "Peru"@en . + +country:Poland a schema:Country ; + rdfs:label "Poland"@en . + +country:Poland_Ukraine a schema:Country ; + rdfs:label "Poland / Ukraine"@en . 
+ +country:Portugal a schema:Country ; + rdfs:label "Portugal"@en . + +country:Prussia a schema:Country ; + rdfs:label "Prussia"@en . + +country:Romania a schema:Country ; + rdfs:label "Romania"@en . + +country:Russia a schema:Country ; + rdfs:label "Russia"@en . + +country:Scandinavia a schema:Country ; + rdfs:label "Scandinavia"@en . + +country:Scotland a schema:Country ; + rdfs:label "Scotland"@en . + +country:Serbia a schema:Country ; + rdfs:label "Serbia"@en . + +country:Singapore a schema:Country ; + rdfs:label "Singapore"@en . + +country:Slovakia a schema:Country ; + rdfs:label "Slovakia"@en . + +country:Slovenia a schema:Country ; + rdfs:label "Slovenia"@en . + +country:South_Africa a schema:Country ; + rdfs:label "South Africa"@en . + +country:Soviet_Union a schema:Country ; + rdfs:label "Soviet Union"@en . + +country:Spain a schema:Country ; + rdfs:label "Spain"@en . + +country:Sri_Lanka a schema:Country ; + rdfs:label "Sri Lanka"@en . + +country:Sweden a schema:Country ; + rdfs:label "Sweden"@en . + +country:Switzerland a schema:Country ; + rdfs:label "Switzerland"@en . + +country:Turkey a schema:Country ; + rdfs:label "Turkey"@en . + +country:UAS a schema:Country ; + rdfs:label "UAS"@en . + +country:Ukraine a schema:Country ; + rdfs:label "Ukraine"@en . + +country:United_Kingdom a schema:Country ; + rdfs:label "United Kingdom"@en . + +country:United_States a schema:Country ; + rdfs:label "United States"@en . + +country:Uruguay a schema:Country ; + rdfs:label "Uruguay"@en . + +country:USA a schema:Country ; + rdfs:label "USA"@en . + +country:Vietnam a schema:Country ; + rdfs:label "Vietnam"@en . + +country:Wales_UK a schema:Country ; + rdfs:label "Wales (UK)"@en . + +country:West_Europe a schema:Country ; + rdfs:label "West Europe"@en . + +country:Yugoslavia a schema:Country ; + rdfs:label "Yugoslavia"@en . 
+ diff --git a/data/countries_wikidata.ttl b/data/countries_wikidata.ttl new file mode 100644 index 0000000..74e16af --- /dev/null +++ b/data/countries_wikidata.ttl @@ -0,0 +1,191 @@ +@prefix country: . +@prefix geo: . +@prefix rdfs: . +@prefix schema: . +@prefix wd: . +@prefix wdt: . + + +country:Algeria schema:sameAs wd:Q262 ; + schema:sameAs geo:2589581 . + +country:Argentina schema:sameAs wd:Q414 ; + schema:sameAs geo:3865483 . + +country:Australia schema:sameAs wd:Q408 ; + schema:sameAs geo:2077456 . + +country:Austria schema:sameAs wd:Q40 ; + schema:sameAs geo:2782113 . + +country:Azerbaijan schema:sameAs wd:Q227 ; + schema:sameAs geo:587116 . + +country:Belarus schema:sameAs wd:Q184 ; + schema:sameAs geo:630336 . + +country:Belgium schema:sameAs wd:Q31 ; + schema:sameAs geo:2802361 . + +country:Brazil schema:sameAs wd:Q155 ; + schema:sameAs geo:3469034 . + +country:Bulgaria schema:sameAs wd:Q219 ; + schema:sameAs geo:732800 . + +country:Canada schema:sameAs wd:Q16 ; + schema:sameAs geo:6251999 . + +country:Chile schema:sameAs wd:Q298 ; + schema:sameAs geo:3895114 . + +country:Costa_Rica schema:sameAs wd:Q800 ; + schema:sameAs geo:3624060 . + +country:Croatia schema:sameAs wd:Q224 ; + schema:sameAs geo:3202326 . + +country:Cuba schema:sameAs wd:Q241 ; + schema:sameAs geo:3562981 . + +country:Czech_Republic schema:sameAs wd:Q213 ; + schema:sameAs geo:3077311 . + +country:Denmark schema:sameAs wd:Q35 ; + schema:sameAs geo:2623032 . + +country:Egypt schema:sameAs wd:Q79 ; + schema:sameAs geo:357994 . + +country:El_Salvador schema:sameAs wd:Q792 ; + schema:sameAs geo:3585968 . + +country:Fiji schema:sameAs wd:Q712 ; + schema:sameAs geo:2205218 . + +country:France schema:sameAs wd:Q142 ; + schema:sameAs geo:3017382 . + +country:Germany schema:sameAs wd:Q183 ; + schema:sameAs geo:2921044 . + +country:Greece schema:sameAs wd:Q41 ; + schema:sameAs geo:390903 . + +country:Hungary schema:sameAs wd:Q28 ; + schema:sameAs geo:719819 . 
+ +country:India schema:sameAs wd:Q668 ; + schema:sameAs geo:1269750 . + +country:Indonesia schema:sameAs wd:Q252 ; + schema:sameAs geo:1643084 . + +country:Ireland schema:sameAs wd:Q27 ; + schema:sameAs geo:2963597 . + +country:Israel schema:sameAs wd:Q801 ; + schema:sameAs geo:294640 . + +country:Italy schema:sameAs wd:Q38 ; + schema:sameAs geo:3175395 . + +country:Japan schema:sameAs wd:Q17 ; + schema:sameAs geo:1861060 . + +country:Latvia schema:sameAs wd:Q211 ; + schema:sameAs geo:458258 . + +country:Lithuania schema:sameAs wd:Q37 ; + schema:sameAs geo:597427 . + +country:Mexico schema:sameAs wd:Q96 ; + schema:sameAs geo:3996063 . + +country:Moldova schema:sameAs wd:Q217 ; + schema:sameAs geo:617790 . + +country:Monaco schema:sameAs wd:Q235 ; + schema:sameAs geo:2993457 . + +country:Myanmar schema:sameAs wd:Q836 ; + schema:sameAs geo:1327865 . + +country:Netherlands schema:sameAs wd:Q55 ; + schema:sameAs geo:2750405 . + +country:New_Zealand schema:sameAs wd:Q664 ; + schema:sameAs geo:2186224 . + +country:Norway schema:sameAs wd:Q20 ; + schema:sameAs geo:3144096 . + +country:Palestine schema:sameAs wd:Q219060 ; + schema:sameAs geo:6254930 . + +country:Panama schema:sameAs wd:Q804 ; + schema:sameAs geo:3703430 . + +country:Paraguay schema:sameAs wd:Q733 ; + schema:sameAs geo:3437598 . + +country:Peru schema:sameAs wd:Q419 ; + schema:sameAs geo:3932488 . + +country:Poland schema:sameAs wd:Q36 ; + schema:sameAs geo:798544 . + +country:Portugal schema:sameAs wd:Q45 ; + schema:sameAs geo:2264397 . + +country:Romania schema:sameAs wd:Q218 ; + schema:sameAs geo:798549 . + +country:Russia schema:sameAs wd:Q159 ; + schema:sameAs geo:2017370 . + +country:Serbia schema:sameAs wd:Q403 ; + schema:sameAs geo:6290252 . + +country:Singapore schema:sameAs wd:Q334 ; + schema:sameAs geo:1880251 . + +country:Slovakia schema:sameAs wd:Q214 ; + schema:sameAs geo:3057568 . + +country:Slovenia schema:sameAs wd:Q215 ; + schema:sameAs geo:3190538 . 
+ +country:South_Africa schema:sameAs wd:Q258 ; + schema:sameAs geo:953987 . + +country:Spain schema:sameAs wd:Q29 ; + schema:sameAs geo:2510769 . + +country:Sri_Lanka schema:sameAs wd:Q854 ; + schema:sameAs geo:1227603 . + +country:Sweden schema:sameAs wd:Q34 ; + schema:sameAs geo:2661886 . + +country:Switzerland schema:sameAs wd:Q39 ; + schema:sameAs geo:2658434 . + +country:Turkey schema:sameAs wd:Q43 ; + schema:sameAs geo:298795 . + +country:Ukraine schema:sameAs wd:Q212 ; + schema:sameAs geo:690791 . + +country:United_Kingdom schema:sameAs wd:Q145 ; + schema:sameAs geo:2635167 . + +country:United_States schema:sameAs wd:Q30 ; + schema:sameAs geo:6252001 . + +country:Uruguay schema:sameAs wd:Q77 ; + schema:sameAs geo:3439705 . + +country:Vietnam schema:sameAs wd:Q881 ; + schema:sameAs geo:1562822 . + diff --git a/data/countries_wikidata_review.ttl b/data/countries_wikidata_review.ttl new file mode 100644 index 0000000..29c1113 --- /dev/null +++ b/data/countries_wikidata_review.ttl @@ -0,0 +1,71 @@ +@prefix country: . +@prefix geo: . +@prefix rdfs: . +@prefix schema: . +@prefix wd: . +@prefix wdt: . + + +# No Wikidata entity found for Channel Islands +country:Channel_Islands rdfs:comment "No Wikidata entity found" . + +# No Wikidata entity found for China +country:China rdfs:comment "No Wikidata entity found" . + +# No Wikidata entity found for Columbia +country:Columbia rdfs:comment "No Wikidata entity found" . + +# No Wikidata entity found for Eastern Europe +country:Eastern_Europe rdfs:comment "No Wikidata entity found" . + +# No Wikidata entity found for England +country:England rdfs:comment "No Wikidata entity found" . + +# No Wikidata entity found for England (UK) +country:England_UK rdfs:comment "No Wikidata entity found" . + +# No Wikidata entity found for Estland +country:Estland rdfs:comment "No Wikidata entity found" . + +# No Wikidata entity found for Europe +country:Europe rdfs:comment "No Wikidata entity found" . 
+ +# No Wikidata entity found for Korea +country:Korea rdfs:comment "No Wikidata entity found" . + +# No Wikidata entity found for North America +country:North_America rdfs:comment "No Wikidata entity found" . + +# No Wikidata entity found for Northern Ireland (UK) +country:Northern_Ireland_UK rdfs:comment "No Wikidata entity found" . + +# No Wikidata entity found for Poland / Ukraine +country:Poland_Ukraine rdfs:comment "No Wikidata entity found" . + +# No Wikidata entity found for Prussia +country:Prussia rdfs:comment "No Wikidata entity found" . + +# No Wikidata entity found for Scandinavia +country:Scandinavia rdfs:comment "No Wikidata entity found" . + +# No Wikidata entity found for Scotland +country:Scotland rdfs:comment "No Wikidata entity found" . + +# No Wikidata entity found for Soviet Union +country:Soviet_Union rdfs:comment "No Wikidata entity found" . + +# No Wikidata entity found for UAS +country:UAS rdfs:comment "No Wikidata entity found" . + +# No Wikidata entity found for USA +country:USA rdfs:comment "No Wikidata entity found" . + +# No Wikidata entity found for Wales (UK) +country:Wales_UK rdfs:comment "No Wikidata entity found" . + +# No Wikidata entity found for West Europe +country:West_Europe rdfs:comment "No Wikidata entity found" . + +# No Wikidata entity found for Yugoslavia +country:Yugoslavia rdfs:comment "No Wikidata entity found" . 
+
diff --git a/src/database.rb b/src/database.rb
index 36ee316..db44197 100644
--- a/src/database.rb
+++ b/src/database.rb
@@ -17,7 +17,9 @@ rdfs = RDF::Vocabulary.new('http://www.w3.org/2000/01/rdf-schema#')
 
 # Common functions
+# Turns +name+ into an IRI-safe local name: every character outside
+# [a-zA-Z0-9_-] becomes '_', runs of '_' collapse and edge '_' are dropped.
 def toName(name)
-  name.gsub(' ', '_')
+  name.gsub(/[^a-zA-Z0-9_-]/, '_').gsub(/_+/, '_').gsub(/\A_+|_+\z/, '')
 end
 
 def get_wd_name(uri)
diff --git a/src/link_countries_to_wikidata.rb b/src/link_countries_to_wikidata.rb
new file mode 100644
index 0000000..e2daa85
--- /dev/null
+++ b/src/link_countries_to_wikidata.rb
@@ -0,0 +1,72 @@
+#!/usr/bin/env ruby
+
+# Links each schema:Country in data/countries.ttl to Wikidata and GeoNames.
+#
+# Every country's rdfs:label is looked up on Wikidata. A unique match is
+# written to data/countries_wikidata.ttl as schema:sameAs triples; labels with
+# no match or several matches are written to
+# data/countries_wikidata_review.ttl so they can be resolved by hand.
+
+require 'rdf'
+require 'rdf/turtle'
+require_relative 'database'
+require_relative 'vocabularies'
+require_relative 'migrants'
+
+# Asks Wikidata for entities whose rdfs:label equals +label+ (an RDF literal,
+# language tag included), whose wdt:P31/wdt:P279* path reaches wd:Q6256 and
+# that have a wdt:P1566 value. Returns the solutions, each binding ?country
+# and ?geoNamesID.
+def wikidata_country_matches(label)
+  WIKIDATA.query(<<~SPARQL)
+    SELECT DISTINCT ?country ?geoNamesID WHERE {
+      ?country rdfs:label #{label.to_ntriples} .
+      ?country wdt:P31/wdt:P279* wd:Q6256 .
+      ?country wdt:P1566 ?geoNamesID .
+    }
+  SPARQL
+end
+
+# Writes the two schema:sameAs triples (Wikidata entity and GeoNames ID) of one
+# query solution for country:+local_name+ to +io+.
+def write_country_same_as(io, local_name, solution)
+  wd_id = get_wd_name(solution[:country].to_s)
+  io.puts "country:#{local_name} schema:sameAs #{wd_id} ;"
+  io.puts " schema:sameAs geo:#{solution[:geoNamesID]} ."
+end
+
+graph = RDF::Graph.load(File.join('data', 'countries.ttl'))
+ttl_prefixes = "#{prefixes(:wd, :wdt, :geo, :rdfs, :schema, :country)}\n\n"
+
+# Block form of File.open: both files are closed even if a Wikidata request
+# raises part-way through the run.
+File.open(File.join('data', 'countries_wikidata.ttl'), 'w') do |output|
+  File.open(File.join('data', 'countries_wikidata_review.ttl'), 'w') do |review|
+    output << ttl_prefixes
+    review << ttl_prefixes
+
+    graph.query([nil, RDF.type, SCHEMA.Country]) do |stmt|
+      subject = stmt.subject
+      label_stmt = graph.query([subject, RDFS.label, nil]).first
+      next unless label_stmt
+
+      label = label_stmt.object
+      local_name = subject.to_s.sub(COUNTRY.to_s, '')
+      solutions = wikidata_country_matches(label)
+
+      case solutions.size
+      when 0
+        review.puts "# No Wikidata entity found for #{label}"
+        review.puts "country:#{local_name} rdfs:comment \"No Wikidata entity found\" ."
+        review.puts
+      when 1
+        write_country_same_as(output, local_name, solutions.first)
+        output.puts
+      else
+        review.puts "# Multiple Wikidata entities found for #{label} — remove all but one:"
+        solutions.each { |solution| write_country_same_as(review, local_name, solution) }
+        review.puts
+      end
+    end
+  end
+end
diff --git a/src/map_countries_from_location.rb b/src/map_countries_from_location.rb
new file mode 100644
index 0000000..9870d73
--- /dev/null
+++ b/src/map_countries_from_location.rb
@@ -0,0 +1,37 @@
+#!/usr/bin/env ruby
+
+# Exports the distinct, non-empty Country values of the location table to
+# data/countries.ttl, one schema:Country with an English rdfs:label each.
+
+require_relative 'database'
+require_relative 'vocabularies'
+require_relative 'migrants'
+
+countries = DB[:location]
+            .distinct
+            .select(:Country)
+            .where(Sequel.~(Country: nil))
+            .where(Sequel.~(Country: ''))
+            .order(:Country)
+
+# Block form of File.open: the file is closed even if the database query or a
+# write raises.
+File.open(File.join('data', 'countries.ttl'), 'w') do |output|
+  output.puts prefixes(:rdfs, :schema, :country)
+  output.puts
+
+  countries.each do |row|
+    name = row[:Country]
+    local_name = toName(name)
+    if local_name.empty?
+      # Nothing usable is left after sanitising (e.g. whitespace-only values);
+      # "country:" alone would denote the namespace IRI itself.
+      warn "Skipping country #{name.inspect}: no usable local name"
+      next
+    end
+
+    output.puts "country:#{local_name} a schema:Country ;"
+    output.puts " rdfs:label #{RDF::Literal.new(name, language: :en).to_ntriples} ."
+    output.puts
+  end
+end
diff --git a/src/vocabularies.rb b/src/vocabularies.rb
index 40e0d3a..e60ad44 100644
--- a/src/vocabularies.rb
+++ b/src/vocabularies.rb
@@ -3,6 +3,7 @@
 # External vocabularies
 WD = RDF::Vocabulary.new('http://www.wikidata.org/entity/')
 WDT = RDF::Vocabulary.new('http://www.wikidata.org/prop/direct/')
+GEO = RDF::Vocabulary.new('http://sws.geonames.org/')
 RDFS = RDF::Vocabulary.new('http://www.w3.org/2000/01/rdf-schema#')
 SCHEMA = RDF::Vocabulary.new('https://schema.org/')
 SKOS = RDF::Vocabulary.new('http://www.w3.org/2004/02/skos/core#')
@@ -27,6 +28,7 @@ WORK = RDF::Vocabulary.new('https://daniel.degu.cl/data/theater-migrants/works.t
 PREFIXES = {
   wd: WD,
   wdt: WDT,
+  geo: GEO,
   rdfs: RDFS,
   schema: SCHEMA,
   country: COUNTRY,
@@ -37,7 +39,7 @@ PREFIXES = {
   occupation: OCCUPATION,
   organisation: ORGANISATION,
   personname: PERSONNAME,
-  personOccupation: PROFESSION,
+  personOccupation: PERSON_OCCUPATION,
   region: REGION,
   relationship: RELATIONSHIP,
   religion: RELIGION,