Created
January 17, 2023 09:20
-
-
Save jesusbagpuss/a8cc8c5328aa6e33e068609bc6f3d6ca to your computer and use it in GitHub Desktop.
Check all EPrints, and all documents, and set language
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl -w

### Skeleton script: visits EPrints (all of them, a single one, or an ID range)
### and hands each to process_eprint() so a language can be set on the EPrint
### and on its documents.
###
### Usage: <script> ARCHIVEID [eprintid[-eprintid]]
###
### the 'use lib' line assumes the script is saved into [EPRINTS_ROOT]/bin/local/
### If the script is saved in another location, please update that line accordingly.
use FindBin;
use lib "$FindBin::Bin/../../perl_lib";

use EPrints;
use strict;

# Set STDOUT to auto flush (without needing a \n)
$|=1;

my $noise = 1;
my $repoid = $ARGV[0];
my $ep_range = $ARGV[1];

if( !defined $repoid )
{
    print STDERR "Please supply ARCHIVEID [eprintid[-eprintid]]\n";
    exit 1;
}

# The optional second argument is either a single EPrint ID or an inclusive
# "from-to" range. Anything else is rejected before the repository is loaded.
if( defined $ep_range )
{
    if( $ep_range =~ m/^\d+$/ )
    {
        print "Checking one EPrint: $ep_range\n";
    }
    elsif( my( $from, $to ) = $ep_range =~ m/^(\d+)\-(\d+)$/ )
    {
        # A reversed range (e.g. 20-12) is treated as a user error.
        if( !defined $from || !defined $to || $from > $to )
        {
            print STDERR "Bad input '$ep_range'. Please supply an EPrintID, or a range of EPrints to check e.g. 12-20.\n";
            exit 1;
        }
        print "Checking eprints in range $ep_range\n";
    }
    else
    {
        print STDERR "Bad input '$ep_range'. Please supply an EPrintID e.g. 1234, or a range e.g. 1-100\n";
        exit 1;
    }
}

# Direct method call rather than indirect object syntax ("new EPrints::Session").
my $session = EPrints::Session->new( 1, $repoid, $noise );
if( !defined $session )
{
    print STDERR "Failed to load repository: $repoid\n";
    exit 1;
}

my $dataset = $session->dataset( "eprint" );

#if you want to keep track of how many things the script has looked at
my $info = {
    ep_count => 0,
    doc_count => 0
};

# Build the list once: restricted to the requested ID/range when one was
# given, otherwise every EPrint in the dataset.
# NOTE(review): this relies on the eprintid field's search accepting both "N"
# and "N-M" with the filter's default match mode - confirm against your
# EPrints version.
my $list = defined $ep_range
    ? $dataset->search(
        filters => [
            { meta_fields => [ "eprintid" ], value => $ep_range },
        ]
    )
    : $dataset->search;

$list->map( \&process_eprint, $info );

# Report what was looked at, so the counters are not collected in vain.
print "Visited " . $list->count . " EPrint(s); assigned a language to $info->{doc_count} document(s).\n";

# Release the session's resources now that all processing is complete.
$session->terminate;

# Callback invoked once per EPrint via ->map( \&process_eprint, $info ).
#
# Arguments:
#   $session - the current session (unused here)
#   $dataset - the dataset being iterated (unused here)
#   $eprint  - the EPrint being visited
#   $info    - hashref of running counters; ep_count is incremented for every
#              EPrint visited, doc_count for every document given a language
#
# For each document a language is set (the document's own detected language,
# or the EPrint-level one as a fallback). When exactly one document language
# was detected, it is also set on the EPrint. Nothing is committed until the
# commit() lines below are un-commented.
sub process_eprint
{
    my( $session, $dataset, $eprint, $info ) = @_;

    # Count every EPrint visited; this counter was declared by the caller
    # but previously never updated.
    $info->{ep_count} += 1;

    #do something to work out EPrint language here - based on title?
    my $ep_lang = '???';

    # keep track of language for all documents - this might be useful when setting the EPrint language field
    my @doc_langs;

    foreach my $doc ( $eprint->get_all_documents )
    {
        #do something to work out document language - based on PDF?
        my $doc_lang = '???';

        if( defined $doc_lang )
        {
            $doc->set_value( "language", $doc_lang );
            push @doc_langs, $doc_lang; # save these to update the EPrint language
            # $doc->commit(); # un-comment this line once you are confident that the script works!
            $info->{doc_count} += 1;
        }
        elsif( defined $ep_lang )
        {
            # No language detected for the document: fall back to the EPrint's.
            $doc->set_value( "language", $ep_lang );
            # $doc->commit(); # un-comment this line once you are confident that the script works!
            $info->{doc_count} += 1;
        }
    }

    if( scalar @doc_langs == 1 )
    {
        #one document, one detected language, set the EPrint lang
        $eprint->set_value( "language", $doc_langs[0] );
        # $eprint->commit(); # un-comment this line once you are confident that the script works!
    }
    else
    {
        # maybe check to see how many different languages there are? If there's 5 documents, all French,
        # then maybe we assume the EPrints should be French!?
    }

    return;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment