diff --git a/Gemfile b/Gemfile index 688dac4..6b219ca 100644 --- a/Gemfile +++ b/Gemfile @@ -2,3 +2,6 @@ source 'https://rubygems.org' gem 'sequel' gem 'mysql2' +gem 'sparql' +gem 'nokogiri' +gem 'pry' diff --git a/Gemfile.lock b/Gemfile.lock index f29c834..a04da3f 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,24 +1,111 @@ GEM remote: https://rubygems.org/ specs: + base64 (0.3.0) + bcp47_spec (0.2.1) bigdecimal (4.0.1) + builder (3.3.0) + coderay (1.1.3) + connection_pool (3.0.2) + ebnf (2.6.0) + base64 (~> 0.2) + htmlentities (~> 4.3) + rdf (~> 3.3) + scanf (~> 1.0) + sxp (~> 2.0) + unicode-types (~> 1.8) + htmlentities (4.4.2) + io-console (0.8.2) + link_header (0.0.8) + logger (1.7.0) + matrix (0.4.3) + method_source (1.1.0) mysql2 (0.5.7) bigdecimal + net-http-persistent (4.0.8) + connection_pool (>= 2.2.4, < 4) + nokogiri (1.19.0-x86_64-linux-gnu) + racc (~> 1.4) + pry (0.16.0) + coderay (~> 1.1) + method_source (~> 1.0) + reline (>= 0.6.0) + racc (1.8.1) + rdf (3.3.1) + bcp47_spec (~> 0.2) + link_header (~> 0.0, >= 0.0.8) + rdf-aggregate-repo (3.3.0) + rdf (~> 3.3) + rdf-xsd (3.3.0) + rdf (~> 3.3) + rexml (~> 3.2) + readline (0.0.4) + reline + reline (0.6.3) + io-console (~> 0.5) + rexml (3.4.4) + scanf (1.0.0) sequel (5.101.0) bigdecimal + sparql (3.3.2) + builder (~> 3.2, >= 3.2.4) + ebnf (~> 2.5) + logger (~> 1.5) + rdf (~> 3.3) + rdf-aggregate-repo (~> 3.3) + rdf-xsd (~> 3.3) + readline (~> 0.0) + sparql-client (~> 3.3) + sxp (~> 2.0) + sparql-client (3.3.0) + net-http-persistent (~> 4.0, >= 4.0.2) + rdf (~> 3.3) + sxp (2.0.0) + matrix (~> 0.4) + rdf (~> 3.3) + unicode-types (1.11.0) PLATFORMS - ruby x86_64-linux DEPENDENCIES mysql2 + nokogiri + pry sequel + sparql CHECKSUMS + base64 (0.3.0) sha256=27337aeabad6ffae05c265c450490628ef3ebd4b67be58257393227588f5a97b + bcp47_spec (0.2.1) sha256=3fd62edf96c126bd9624e4319ac74082a966081859d1ee0ef3c3041640a37810 bigdecimal (4.0.1) sha256=8b07d3d065a9f921c80ceaea7c9d4ae596697295b584c296fe599dd0ad01c4a7 + 
builder (3.3.0) sha256=497918d2f9dca528fdca4b88d84e4ef4387256d984b8154e9d5d3fe5a9c8835f + coderay (1.1.3) sha256=dc530018a4684512f8f38143cd2a096c9f02a1fc2459edcfe534787a7fc77d4b + connection_pool (3.0.2) sha256=33fff5ba71a12d2aa26cb72b1db8bba2a1a01823559fb01d29eb74c286e62e0a + ebnf (2.6.0) sha256=e746a316caa885cc45e243dc33efc194943956760bc9bc13948de1732fbcf63e + htmlentities (4.4.2) sha256=bbafbdf69f2eca9262be4efef7e43e6a1de54c95eb600f26984f71d2fe96c5c3 + io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc + link_header (0.0.8) sha256=15c65ce43b29f739b30d05e5f25c22c23797e89cf6f905dbb595fb4c70cb55f9 + logger (1.7.0) sha256=196edec7cc44b66cfb40f9755ce11b392f21f7967696af15d274dde7edff0203 + matrix (0.4.3) sha256=a0d5ab7ddcc1973ff690ab361b67f359acbb16958d1dc072b8b956a286564c5b + method_source (1.1.0) sha256=181301c9c45b731b4769bc81e8860e72f9161ad7d66dd99103c9ab84f560f5c5 mysql2 (0.5.7) sha256=ba09ede515a0ae8a7192040a1b778c0fb0f025fa5877e9be895cd325fa5e9d7b + net-http-persistent (4.0.8) sha256=ef3de8319d691537b329053fae3a33195f8b070bbbfae8bf1a58c796081960e6 + nokogiri (1.19.0-x86_64-linux-gnu) sha256=f482b95c713d60031d48c44ce14562f8d2ce31e3a9e8dd0ccb131e9e5a68b58c + pry (0.16.0) sha256=d76c69065698ed1f85e717bd33d7942c38a50868f6b0673c636192b3d1b6054e + racc (1.8.1) sha256=4a7f6929691dbec8b5209a0b373bc2614882b55fc5d2e447a21aaa691303d62f + rdf (3.3.1) sha256=dda6a2c95198915fc63d66ee270e35d4a76d431720747a2cf97ecd92062fa150 + rdf-aggregate-repo (3.3.0) sha256=5693ccabf4dbbec7113c95e9aab028311f19d6022764fdebc6327f9d55a9efdc + rdf-xsd (3.3.0) sha256=fab51d27b20344237d9b622ef32e83e4c44940840bfc76a245ce6b6abba44772 + readline (0.0.4) sha256=6138eef17be2b98298b672c3ea63bf9cb5158d401324f26e1e84f235879c1d6a + reline (0.6.3) sha256=1198b04973565b36ec0f11542ab3f5cfeeec34823f4e54cebde90968092b1835 + rexml (3.4.4) sha256=19e0a2c3425dfbf2d4fc1189747bdb2f849b6c5e74180401b15734bc97b5d142 + scanf (1.0.0) 
sha256=533db7f7e5acafea1a145d6c5329cef667a58fbcb7d64379a808ff1199ee1b00 sequel (5.101.0) sha256=d2ae3fd997a7c4572e8357918e777869faf90dc19310fcd6332747122aed2b29 + sparql (3.3.2) sha256=20d73a62801fd6d03c834ca5012c11aaf6594ef536f554e92fd94bf9b3ed64dc + sparql-client (3.3.0) sha256=71225eefad48dc2baab6b7008df8a9bcfffa833e5f25387dbe87ff52a5cad64e + sxp (2.0.0) sha256=79971bbab54a82fe4a861332475eb8c1f33142d70f2b7e830dacbd9082824721 + unicode-types (1.11.0) sha256=81d1201273260fa89b85471e7eebb93a51bb4e5f078a525508dcae7835d176f9 BUNDLED WITH 4.0.3 diff --git a/data/000-found_countries_many.ttl b/data/000-found_countries_many.ttl new file mode 100644 index 0000000..e69de29 diff --git a/data/000-found_countries_one.ttl b/data/000-found_countries_one.ttl new file mode 100644 index 0000000..8e78803 --- /dev/null +++ b/data/000-found_countries_one.ttl @@ -0,0 +1,183 @@ +country:Argentina a mig:Country ; + rdfs:label "Argentina"@en ; + mig:wikidataID wd:Q414 . +country:Austria a mig:Country ; + rdfs:label "Austria"@en ; + mig:wikidataID wd:Q40 . +country:Australia a mig:Country ; + rdfs:label "Australia"@en ; + mig:wikidataID wd:Q408 . +country:Azerbaijan a mig:Country ; + rdfs:label "Azerbaijan"@en ; + mig:wikidataID wd:Q227 . +country:Belgium a mig:Country ; + rdfs:label "Belgium"@en ; + mig:wikidataID wd:Q31 . +country:Belarus a mig:Country ; + rdfs:label "Belarus"@en ; + mig:wikidataID wd:Q184 . +country:Bulgaria a mig:Country ; + rdfs:label "Bulgaria"@en ; + mig:wikidataID wd:Q219 . +country:Brazil a mig:Country ; + rdfs:label "Brazil"@en ; + mig:wikidataID wd:Q155 . +country:Canada a mig:Country ; + rdfs:label "Canada"@en ; + mig:wikidataID wd:Q16 . +country:Switzerland a mig:Country ; + rdfs:label "Switzerland"@en ; + mig:wikidataID wd:Q39 . +country:Chile a mig:Country ; + rdfs:label "Chile"@en ; + mig:wikidataID wd:Q298 . +country:Ukraine a mig:Country ; + rdfs:label "Ukraine"@en ; + mig:wikidataID wd:Q212 . 
+country:Costa_Rica a mig:Country ; + rdfs:label "Costa Rica"@en ; + mig:wikidataID wd:Q800 . +country:Cuba a mig:Country ; + rdfs:label "Cuba"@en ; + mig:wikidataID wd:Q241 . +country:Czech_Republic a mig:Country ; + rdfs:label "Czech Republic"@en ; + mig:wikidataID wd:Q213 . +country:Denmark a mig:Country ; + rdfs:label "Denmark"@en ; + mig:wikidataID wd:Q35 . +country:Algeria a mig:Country ; + rdfs:label "Algeria"@en ; + mig:wikidataID wd:Q262 . +country:Egypt a mig:Country ; + rdfs:label "Egypt"@en ; + mig:wikidataID wd:Q79 . +country:El_Salvador a mig:Country ; + rdfs:label "El Salvador"@en ; + mig:wikidataID wd:Q792 . +country:Spain a mig:Country ; + rdfs:label "Spain"@en ; + mig:wikidataID wd:Q29 . +country:Fiji a mig:Country ; + rdfs:label "Fiji"@en ; + mig:wikidataID wd:Q712 . +country:France a mig:Country ; + rdfs:label "France"@en ; + mig:wikidataID wd:Q142 . +country:United_Kingdom a mig:Country ; + rdfs:label "United Kingdom"@en ; + mig:wikidataID wd:Q145 . +country:Germany a mig:Country ; + rdfs:label "Germany"@en ; + mig:wikidataID wd:Q183 . +country:Greece a mig:Country ; + rdfs:label "Greece"@en ; + mig:wikidataID wd:Q41 . +country:Croatia a mig:Country ; + rdfs:label "Croatia"@en ; + mig:wikidataID wd:Q224 . +country:Hungary a mig:Country ; + rdfs:label "Hungary"@en ; + mig:wikidataID wd:Q28 . +country:Slovakia a mig:Country ; + rdfs:label "Slovakia"@en ; + mig:wikidataID wd:Q214 . +country:Indonesia a mig:Country ; + rdfs:label "Indonesia"@en ; + mig:wikidataID wd:Q252 . +country:Ireland a mig:Country ; + rdfs:label "Ireland"@en ; + mig:wikidataID wd:Q27 . +country:India a mig:Country ; + rdfs:label "India"@en ; + mig:wikidataID wd:Q668 . +country:Israel a mig:Country ; + rdfs:label "Israel"@en ; + mig:wikidataID wd:Q801 . +country:Italy a mig:Country ; + rdfs:label "Italy"@en ; + mig:wikidataID wd:Q38 . +country:Japan a mig:Country ; + rdfs:label "Japan"@en ; + mig:wikidataID wd:Q17 . 
+country:Latvia a mig:Country ; + rdfs:label "Latvia"@en ; + mig:wikidataID wd:Q211 . +country:Lithuania a mig:Country ; + rdfs:label "Lithuania"@en ; + mig:wikidataID wd:Q37 . +country:Sri_Lanka a mig:Country ; + rdfs:label "Sri Lanka"@en ; + mig:wikidataID wd:Q854 . +country:Monaco a mig:Country ; + rdfs:label "Monaco"@en ; + mig:wikidataID wd:Q235 . +country:Myanmar a mig:Country ; + rdfs:label "Myanmar"@en ; + mig:wikidataID wd:Q836 . +country:Moldova a mig:Country ; + rdfs:label "Moldova"@en ; + mig:wikidataID wd:Q217 . +country:Mexico a mig:Country ; + rdfs:label "Mexico"@en ; + mig:wikidataID wd:Q96 . +country:Netherlands a mig:Country ; + rdfs:label "Netherlands"@en ; + mig:wikidataID wd:Q55 . +country:Norway a mig:Country ; + rdfs:label "Norway"@en ; + mig:wikidataID wd:Q20 . +country:New_Zealand a mig:Country ; + rdfs:label "New Zealand"@en ; + mig:wikidataID wd:Q664 . +country:Panama a mig:Country ; + rdfs:label "Panama"@en ; + mig:wikidataID wd:Q804 . +country:Peru a mig:Country ; + rdfs:label "Peru"@en ; + mig:wikidataID wd:Q419 . +country:Poland a mig:Country ; + rdfs:label "Poland"@en ; + mig:wikidataID wd:Q36 . +country:Paraguay a mig:Country ; + rdfs:label "Paraguay"@en ; + mig:wikidataID wd:Q733 . +country:Palestine a mig:Country ; + rdfs:label "Palestine"@en ; + mig:wikidataID wd:Q219060 . +country:Portugal a mig:Country ; + rdfs:label "Portugal"@en ; + mig:wikidataID wd:Q45 . +country:Romania a mig:Country ; + rdfs:label "Romania"@en ; + mig:wikidataID wd:Q218 . +country:Russia a mig:Country ; + rdfs:label "Russia"@en ; + mig:wikidataID wd:Q159 . +country:Sweden a mig:Country ; + rdfs:label "Sweden"@en ; + mig:wikidataID wd:Q34 . +country:Singapore a mig:Country ; + rdfs:label "Singapore"@en ; + mig:wikidataID wd:Q334 . +country:Slovenia a mig:Country ; + rdfs:label "Slovenia"@en ; + mig:wikidataID wd:Q215 . +country:Serbia a mig:Country ; + rdfs:label "Serbia"@en ; + mig:wikidataID wd:Q403 . 
+country:Turkey a mig:Country ;
+ rdfs:label "Turkey"@en ;
+ mig:wikidataID wd:Q43 .
+country:United_States a mig:Country ;
+ rdfs:label "United States"@en ;
+ mig:wikidataID wd:Q30 .
+country:Uruguay a mig:Country ;
+ rdfs:label "Uruguay"@en ;
+ mig:wikidataID wd:Q77 .
+country:Vietnam a mig:Country ;
+ rdfs:label "Vietnam"@en ;
+ mig:wikidataID wd:Q881 .
+country:South_Africa a mig:Country ;
+ rdfs:label "South Africa"@en ;
+ mig:wikidataID wd:Q258 .
diff --git a/data/000-found_countries_zero.ttl b/data/000-found_countries_zero.ttl
new file mode 100644
index 0000000..92def20
--- /dev/null
+++ b/data/000-found_countries_zero.ttl
@@ -0,0 +1,24 @@
+Channel Islands
+China
+Columbia
+Eastern Europe
+England
+Estland
+Europe
+Northern Ireland (UK)
+ITaly
+Korea
+
+USA
+North America
+Poland / Ukraine
+Prussia
+Scandinavia
+Scotland
+Soviet Union
+England (UK)
+Wales (UK)
+USa
+UAS
+West Europe
+Yugoslavia
diff --git a/src/000-found-countries.rb b/src/000-found-countries.rb
new file mode 100644
index 0000000..846b4ba
--- /dev/null
+++ b/src/000-found-countries.rb
@@ -0,0 +1,69 @@
+#!/usr/bin/env ruby
+
+# Step 000: queries Wikidata for every distinct country name found in the
+# `location` table and sorts the names into three files by number of matches.
+
+require_relative 'database'
+require_relative 'vocabularies'
+
+# Output files of this step
+found_one = File.open(File.join('data', '000-found_countries_one.ttl'), 'w')
+found_zero = File.open(File.join('data', '000-found_countries_zero.ttl'), 'w')
+found_many = File.open(File.join('data', '000-found_countries_many.ttl'), 'w')
+files = [found_one, found_zero, found_many]
+
+# NOTE(review): the definitions written below also use the `country:` and
+# `mig:` prefixes, which are not declared here -- TODO add them to PREFIXES.
+files.each do |file|
+  file.puts prefixes(:wd, :wdt, :rdfs)
+end
+
+# Distinct, non-empty country names of the location table
+countries = {}
+
+DB[:location].each do |location|
+  name = location[:Country]
+  next if name.nil? || name.empty? || countries.key?(name)
+
+  # Use :Country consistently; the symbol key is case-sensitive.
+  countries[name] = { id: "region:#{name}" }
+end
+
+# Builds the Turtle definition of one country.
+#
+# @param country_name_literal [RDF::Literal] the country name
+# @param wd_solutions [Enumerable] query solutions; each is read at :country
+# @return [String] the country resource with its label and Wikidata IDs
+def country_definition(country_name_literal, wd_solutions)
+  wd_ids = wd_solutions.map do |solution|
+    country_uri = solution[:country].to_s
+    get_wd_name(country_uri)
+  end
+  "country:#{toName(country_name_literal.to_s)} a mig:Country ;\n" \
+  " rdfs:label #{country_name_literal.to_ntriples} ;\n" \
+  " mig:wikidataID #{wd_ids.join(' , ')} .\n"
+end
+
+begin
+  countries.each_key do |country_name|
+    p country_name
+    country_name_literal = RDF::Literal.new(country_name, language: :en)
+    # WIKIDATA is the SPARQL client constant defined in database.rb.
+    query = WIKIDATA.select
+                    .where([:country, RDFS.label, country_name_literal])
+                    .where([:country, WDT.P31, WD.Q6256])
+    solutions = query.solutions
+    case solutions.size
+    when 0
+      found_zero << "#{country_name}\n"
+    when 1
+      found_one << country_definition(country_name_literal, solutions)
+    else
+      found_many << country_definition(country_name_literal, solutions)
+    end
+    sleep(1) # presumably throttles requests to the public endpoint -- confirm
+  end
+ensure
+  # Close the outputs even when a query raises, so buffered results are written.
+  files.each(&:close)
+end
diff --git a/src/database.rb b/src/database.rb
new file mode 100644
index 0000000..6662c5d
--- /dev/null
+++ b/src/database.rb
@@ -0,0 +1,25 @@
+require 'sequel'
+require 'sparql/client'
+require 'rdf'
+
+# Database connectors
+
+DB = Sequel.mysql2(host: 'localhost', user: 'migrants', database: 'migrants', password: '1234')
+
+# A constant rather than a local variable: top-level locals of this file are
+# not visible in the scripts that load it with require_relative.
+WIKIDATA = SPARQL::Client.new('https://query.wikidata.org/sparql')
+
+# The vocabularies (WD, WDT, RDFS) are defined in vocabularies.rb.
+
+# Common functions
+
+# Turns a name into an identifier by replacing every space with an underscore.
+def toName(name)
+  name.gsub(' ', '_')
+end
+
+# Abbreviates a Wikidata entity URI by replacing the entity namespace with `wd:`.
+def get_wd_name(uri)
+  uri.to_s.gsub('http://www.wikidata.org/entity/', 'wd:')
+end
diff --git a/src/map_countries.rb b/src/map_countries.rb
new file mode 100755
index 0000000..24130ce
--- /dev/null
+++ b/src/map_countries.rb
@@ -0,0 +1,70 @@
+#!/usr/bin/env ruby
+
+require 'sparql/client'
+require 'rdf'
+require 'pry'
+
+found_one = File.open('found_countries_one.ttl', 'w')
+found_zero = File.open('found_countries_zero.ttl', 'w')
+found_many = File.open('found_countries_many.ttl', 'w')
+
+wikidata = SPARQL::Client.new('https://query.wikidata.org/sparql')
+
+# Vocabularies
+wd = RDF::Vocabulary.new('http://www.wikidata.org/entity/')
+wdt = RDF::Vocabulary.new('http://www.wikidata.org/prop/direct/')
+rdfs = RDF::Vocabulary.new('http://www.w3.org/2000/01/rdf-schema#')
+
+# binding.pry
+
+# query = wikidata.select.where([:s, :p, :o]).limit(10)
+
+# query.each_solution do |solution|
+# p solution
+# end
+
+require_relative 'database'
+
+COUNTRIES = {}
+
+DB[:location].each do |location|
+  name = location[:Country]
+  # Skip missing, empty and already collected names.
+  next if name.nil? || name.empty? || COUNTRIES.key?(name)
+
+  # Use :Country consistently; the symbol key is case-sensitive.
+  COUNTRIES[name] = { id: "region:#{name}" }
+end
+
+# Builds the Turtle definition of one country from its label and query solutions.
+def country_definition(country_name_literal, wd_solutions)
+  wd_ids = wd_solutions.map do |solution|
+    country_uri = solution[:country].to_s
+    get_wd_name(country_uri)
+  end
+  "country:#{toName(country_name_literal.to_s)} a mig:Country ;\n" \
+  " rdfs:label #{country_name_literal.to_ntriples} ;\n" \
+  " mig:wikidataID #{wd_ids.join(' , ')} .\n"
+end
+
+begin
+  COUNTRIES.each_key do |country_name|
+    p country_name
+    country_name_literal = RDF::Literal.new(country_name, language: :en)
+    query = wikidata.select
+                    .where([:country, rdfs.label, country_name_literal])
+                    .where([:country, wdt.P31, wd.Q6256])
+    solutions = query.solutions
+    case solutions.size
+    when 0
+      found_zero << "#{country_name}\n"
+    when 1
+      found_one << country_definition(country_name_literal, solutions)
+    else
+      found_many << country_definition(country_name_literal, solutions)
+    end
+    sleep(1)
+  end
+ensure
+  [found_one, found_zero, found_many].each(&:close)
+end
diff --git a/src/map_locations.rb b/src/map_locations.rb
index 9a71b32..33ded75 100755
--- a/src/map_locations.rb
+++ b/src/map_locations.rb
@@ -1,12 +1,6 @@
 #!/usr/bin/env ruby
 
-require 'sequel'
-
-DB = Sequel.mysql2(host: 'localhost', user: 'migrants', database:
'migrants', password: '1234')
-
-def toName(name)
-  name.gsub(' ', '_')
-end
+require_relative 'database'
 
 # Define the regions
 
diff --git a/src/vocabularies.rb b/src/vocabularies.rb
new file mode 100644
index 0000000..0220d0d
--- /dev/null
+++ b/src/vocabularies.rb
@@ -0,0 +1,21 @@
+# Vocabularies
+
+require 'rdf'
+
+WD = RDF::Vocabulary.new('http://www.wikidata.org/entity/')
+WDT = RDF::Vocabulary.new('http://www.wikidata.org/prop/direct/')
+RDFS = RDF::Vocabulary.new('http://www.w3.org/2000/01/rdf-schema#')
+
+# Maps the prefix names accepted by #prefixes to their vocabularies.
+PREFIXES = { wd: WD, wdt: WDT, rdfs: RDFS }
+
+# Builds the Turtle @prefix declarations for the given prefix names.
+#
+# @param namespaces [Array<Symbol>] keys of PREFIXES, e.g. :wd, :rdfs
+# @return [String] one "@prefix name: <iri> .\n" line per name, concatenated
+# @raise [KeyError] if a name is not a key of PREFIXES
+def prefixes(*namespaces)
+  namespaces.map do |ns|
+    "@prefix #{ns}: <#{PREFIXES.fetch(ns)}> .\n"
+  end.join
+end