Skip to content

Instantly share code, notes, and snippets.

@jesusbagpuss
Created January 17, 2023 09:20
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save jesusbagpuss/a8cc8c5328aa6e33e068609bc6f3d6ca to your computer and use it in GitHub Desktop.
Check all EPrints, and all documents, and set language
#!/usr/bin/perl -w
### The 'use lib' line assumes the script is saved into [EPRINTS_ROOT]/bin/local/
### If the script is saved in another location, please update that line accordingly.
use FindBin;
use lib "$FindBin::Bin/../../perl_lib";
use EPrints;
use strict;
# Flush STDOUT after every write so output appears even without a trailing \n.
$| = 1;

# Noise level handed to the EPrints session constructor further down.
my $noise = 1;

# Command line: ARCHIVEID [eprintid[-eprintid]]
my( $repoid, $ep_range ) = @ARGV;

unless( defined $repoid )
{
    print STDERR "Please supply ARCHIVEID [eprintid[-eprintid]]\n";
    exit 1;
}

# The optional second argument is either a single ID or an ascending FROM-TO range.
if( defined $ep_range )
{
    if( $ep_range =~ m/^(\d+)\-(\d+)$/ )
    {
        # Both captures are guaranteed defined once the pattern has matched,
        # so only the ordering of the bounds needs checking.
        my( $lower, $upper ) = ( $1, $2 );
        if( $lower > $upper )
        {
            print STDERR "Bad input '$ep_range'. Please supply an EPrintID, or a range of EPrints to check e.g. 12-20.\n";
            exit 1;
        }
        print "Checking eprints in range $ep_range\n";
    }
    elsif( $ep_range =~ m/^\d+$/ )
    {
        print "Checking one EPrint: $ep_range\n";
    }
    else
    {
        print STDERR "Bad input '$ep_range'. Please supply an EPrintID e.g. 1234, or a range e.g. 1-100\n";
        exit 1;
    }
}
# Open the repository. Direct method-call syntax is used instead of the
# indirect-object form ("new EPrints::Session ..."), which perl can mis-parse.
my $session = EPrints::Session->new( 1, $repoid, $noise );
if( !defined $session )
{
    print STDERR "Failed to load repository: $repoid\n";
    exit 1;
}

my $dataset = $session->dataset( "eprint" );

# Running totals; this hashref is passed to process_eprint, which updates it in place.
my $info = {
    ep_count => 0,
    doc_count => 0
};

# With no range on the command line every EPrint is processed; otherwise the
# search is restricted by an eprintid filter carrying the validated ID or range.
my %search_opts;
if( defined $ep_range )
{
    $search_opts{filters} = [
        { meta_fields => [ "eprintid" ], value => $ep_range },
    ];
}

my $list = $dataset->search( %search_opts );
$list->map( \&process_eprint, $info );

# Report what was looked at. The EPrint total is taken from the result list
# itself; doc_count is the number of documents process_eprint assigned a
# language value to (nothing is saved unless the commit() calls are enabled).
print "EPrints matched: " . $list->count . "\n";
print "Documents given a language value: $info->{doc_count}\n";

# Shut the session down cleanly now that all work is done.
$session->terminate();
# Callback run once per EPrint via the search result list's map().
#
# Arguments (unpacked from @_):
#   $session - first callback argument (not used here)
#   $dataset - second callback argument (not used here)
#   $eprint  - the record being processed; get_all_documents() and set_value()
#              are called on it
#   $info    - hashref of running counters (ep_count, doc_count), updated in place
#
# Every commit() call is still commented out, so this sub only calls
# set_value() on the objects it is given and bumps the counters.
sub process_eprint
{
    my( $session, $dataset, $eprint, $info ) = @_;

    # Count every EPrint visited so that ep_count reflects the work done.
    $info->{ep_count} += 1;

    # TODO: do something to work out the EPrint language here - based on title?
    my $ep_lang = '???';

    # Keep track of the language of every document - this might be useful
    # when setting the EPrint language field.
    my @doc_langs;

    foreach my $doc ( $eprint->get_all_documents )
    {
        # TODO: do something to work out the document language - based on PDF?
        # NOTE: while this placeholder is a defined string, the elsif fallback
        # below is never reached; real detection should yield undef on failure.
        my $doc_lang = '???';

        if( defined $doc_lang )
        {
            $doc->set_value( "language", $doc_lang );
            push @doc_langs, $doc_lang; # save these to update the EPrint language
            # $doc->commit(); # un-comment this line once you are confident that the script works!
            $info->{doc_count} += 1;
        }
        elsif( defined $ep_lang )
        {
            # No document-level language: fall back to the EPrint-level one.
            $doc->set_value( "language", $ep_lang );
            # $doc->commit(); # un-comment this line once you are confident that the script works!
            $info->{doc_count} += 1;
        }
    }

    if( scalar @doc_langs == 1 )
    {
        # One document, one detected language: set the EPrint language.
        $eprint->set_value( "language", $doc_langs[0] );
        # $eprint->commit(); # un-comment this line once you are confident that the script works!
    }
    # Otherwise: maybe check to see how many different languages there are?
    # If there are 5 documents, all French, then maybe we assume the EPrint
    # should be French!?

    return;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment