Add 'mojo-useragent-bibsonomy.pl'

This commit is contained in:
zino
2021-05-06 13:35:12 +02:00
parent a706fbbfa7
commit 49dd186111

mojo-useragent-bibsonomy.pl

@@ -0,0 +1,867 @@
#!/usr/bin/env perl
my $license = <<"LICENSE";
MIT License
Copyright (c) 2021 Marlon Maschkiwitz
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
LICENSE
use strict;
use warnings;
use open qw(:locale);
use feature qw(say);
use POSIX qw(strftime);
use Data::Validate::URI qw(is_web_uri);
use File::Path qw(make_path);
use List::Util qw( min max uniq );
use File::Copy qw(copy);
use utf8;
use Benchmark;
use Data::Dumper;
use URI;
use Getopt::Long;
use File::Basename;
use LWP::UserAgent;
use DBIx::Connector;
use URI::Encode;
use Search::Sitemap::Index;
use Search::Sitemap;
#
# Global VARS
#
my (%uniq, $ua, %data, $connection, $reached_link_limit);
my $actionNr = 1;
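# %config bundles everything the script needs at runtime: MySQL credentials
# and per-entity SQL queries, search-engine ping endpoints, command-line
# defaults, and the dispatch table of URL checks applied to every link.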
my %config = (
'MYSQL' => {
'ACCESS' => {
'database' => 'bibsonomy',
'hostname' => '127.0.0.1',
'user' => 'mma',
'pw' => 'BP5ZDavtZN',
'port' => 3306,
},
'OPTIONS' => {
'PrintError' => 0,
'RaiseError' => 1,
'AutoCommit' => 1,
'Reconnect' => 1,
'LimitPerQuery' => 10000,
},
'BIBSONOMY' => {
"publicationsUser" => {
"query" => "SELECT simhash1, user_name FROM bibtex b JOIN user u USING (user_name) WHERE b.group=0 AND u.spammer=0 AND u.to_classify=0 AND u.updated_by != 'classifier' AND u.updated_by IS NOT NULL AND b.user_name NOT IN ('dblp', 'genealogie') ORDER BY simhash1 LIMIT %d OFFSET %d",
"id_key" => "simhash1",
"prepend" => "publication",
},
"publicationsPublic" => {
"query" => "SELECT simhash1, user_name FROM bibtex b JOIN user u USING (user_name) WHERE b.group=0 AND u.spammer=0 AND u.to_classify=0 AND u.updated_by != 'classifier' AND u.updated_by IS NOT NULL AND b.user_name NOT IN ('dblp', 'genealogie') ORDER BY simhash1 LIMIT %d OFFSET %d;",
"id_key" => "simhash1",
"prepend" => "publication",
},
"groups" => {
"query" => "SELECT group_name from groupids g WHERE g.group > 2 LIMIT %d OFFSET %d;",
"id_key" => "group_name",
"prepend" => "group",
},
"users" => {
"query" => "SELECT user_name FROM user u WHERE u.spammer=0 AND u.to_classify=0 AND u.updated_by != 'classifier' AND u.updated_by IS NOT NULL AND u.user_name NOT IN ('dblp', 'genealogie') LIMIT %d OFFSET %d;",
"id_key" => "user_name",
"prepend" => "user",
},
},
},
'PING' => {
'GOOGLE' => {
'URL' => 'http://www.google.com/ping?sitemap=',
},
},
'GetOpts' => {
'URL' => 'https://bibsonomy.org',
'ALLOWED_RESPONSES' => [],
'LINK_LIMIT' => undef,
'LINK_DEPTH' => 3,
'SITEMAP_LIMIT' => 50000,
'SITEMAP_COMPRESS' => 1,
'OUTPUT_LINKS' => 0,
'LINK_REGEX' => [],
'SITEMAP_ROOT' => undef,
'SITEMAP_SUBMIT' => 0,
'VERBOSE_LEVEL' => 0,
'OUTPUT_CONFIG' => 0,
'OUTPUT_DATA' => 0,
'CHECK_UNIQUE' => 1,
'CHECK_DEPTH' => 0,
'CHECK_RESPONSES' => 0,
'CHECK_REGEX' => 0,
},
'urlChecks' => {
'CHECK_UNIQUE' => \&checkLinkUnique,
'LINK_DEPTH' => \&checkLinkDepth,
'CHECK_REGEX' => \&checkLinkRegex,
'CHECK_RESPONSES' => \&checkLinkResponse,
},
"staticLinks" => ['gettingStarted','help','groups','popular','tags','persons'],
'CONSTANTS' => {},
);
my ($getOpts, $constants, $mySQLAccess, $mySQLOptions, $mySQLBibsonomy) = ($config{'GetOpts'}, $config{'CONSTANTS'}, $config{'MYSQL'}{'ACCESS'}, $config{'MYSQL'}{'OPTIONS'}, $config{'MYSQL'}{'BIBSONOMY'});
#
# Workflow
#
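# Top-level pipeline: parse and validate the command-line flags, derive the
# run constants, crawl the database for links, then write, copy, and
# (optionally) submit the resulting sitemaps.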
&startBenchmarking();
&getCommandFlags();
&checkUrlArg();
&checkSitemapRootArg();
&checkResponsesArg();
&setConfigConstants();
&checkSitemapSubmitArg();
&initCrawling();
&canCopy();
&printScriptHeaders();
&runCrawlerBibsonomy();
&generateSitemaps();
&submitSitemaps();
&finishRun();
#
# Initialization methods
#
sub initCrawling {
&Delimiter((caller(0))[3], 4);
my $runDir = $constants->{'RUN_DIR'};
# REGEX ARGS
if (@{$getOpts->{'LINK_REGEX'}}) {
$getOpts->{'CHECK_REGEX'} = 1;
for my $i ( 0 .. scalar @{$getOpts->{'LINK_REGEX'}}-1 ) {
$getOpts->{LINK_REGEX}[$i] = qr/$getOpts->{LINK_REGEX}[$i]/;
}
}
$connection = connectToMySql($mySQLAccess->{'database'});
&logAssert("Creating run directory: $constants->{'RUN_DIR'}", 3);
make_path($constants->{'RUN_DIR'}, { chmod => 0777 });
removeDuplicateValues();
}
sub logAssert {
my ($message, $verboseLevel) = @_;
say $message if $verboseLevel <= $getOpts->{'VERBOSE_LEVEL'};
}
sub getCommandFlags {
&Delimiter((caller(0))[3], 4);
GetOptions(
'url=s' => \$getOpts->{'URL'},
'responses=i' => \@{$getOpts->{'ALLOWED_RESPONSES'}},
'link-limit=i' => \$getOpts->{'LINK_LIMIT'},
'link-depth=i' => \$getOpts->{'LINK_DEPTH'},
'sitemap-limit=i' => \$getOpts->{'SITEMAP_LIMIT'},
'compress!' => \$getOpts->{'SITEMAP_COMPRESS'},
'output-links' => \$getOpts->{'OUTPUT_LINKS'},
'link-regex=s' => \@{$getOpts->{'LINK_REGEX'}},
'sitemap-root=s' => \$getOpts->{'SITEMAP_ROOT'},
'sitemap-submit' => \$getOpts->{'SITEMAP_SUBMIT'},
'output-config' => \$getOpts->{'OUTPUT_CONFIG'},
'output-data' => \$getOpts->{'OUTPUT_DATA'},
'verbose-level=i' => \$getOpts->{'VERBOSE_LEVEL'},
'check-unique!' => \$getOpts->{'CHECK_UNIQUE'},
'check-depth!' => \$getOpts->{'CHECK_DEPTH'},
'check-responses!' => \$getOpts->{'CHECK_RESPONSES'},
) or die;
}
sub checkUrlArg {
&Delimiter((caller(0))[3], 4);
return if !$getOpts->{'URL'};
die "--url \"$getOpts->{'URL'}\" is not a valid web uri. Make sure it starts with http or https." if !is_web_uri($getOpts->{'URL'});
}
sub checkResponsesArg {
&Delimiter((caller(0))[3], 4);
return if !@{$getOpts->{'ALLOWED_RESPONSES'}} && !$getOpts->{'CHECK_RESPONSES'};
die "You must use --responses <int> together with --check-responses" if !@{$getOpts->{'ALLOWED_RESPONSES'}} || !$getOpts->{'CHECK_RESPONSES'};
}
sub checkSitemapRootArg {
&Delimiter((caller(0))[3], 4);
return if !$getOpts->{'SITEMAP_ROOT'};
die "--sitemap-root \"$getOpts->{'SITEMAP_ROOT'}\" does not exist: $!" if (!-d $getOpts->{'SITEMAP_ROOT'});
}
sub checkSitemapSubmitArg {
&Delimiter((caller(0))[3], 4);
return if !$getOpts->{'SITEMAP_ROOT'} && !$getOpts->{'SITEMAP_SUBMIT'};
if( $getOpts->{'SITEMAP_SUBMIT'} && !$getOpts->{'SITEMAP_ROOT'} ) {
die "When using --sitemap-submit you need also set --sitemap-root, which is the absolute path of the websites root folder.";
}elsif( $getOpts->{'SITEMAP_SUBMIT'} && $getOpts->{'SITEMAP_ROOT'} ) {
die "--sitemap-root \"$getOpts->{'SITEMAP_ROOT'}\" does not exist." if (!-e $getOpts->{'SITEMAP_ROOT'});
die "--sitemap-root \"$getOpts->{'SITEMAP_ROOT'}\" is not a directory." if (!-d $getOpts->{'SITEMAP_ROOT'});
}
}
sub removeDuplicateValues {
&Delimiter((caller(0))[3], 4);
foreach my $key ('ALLOWED_RESPONSES', 'LINK_REGEX') {
@{$getOpts->{$key}} = uniq(@{$getOpts->{$key}});
}
}
sub setConfigConstants {
&Delimiter((caller(0))[3], 4);
my $u = URI->new( $getOpts->{'URL'} );
($data{'URL'}{'SITEMAP_ROOT'} = $u->canonical->scheme . '://' . $u->canonical->host . $u->canonical->path) =~ s/\/$//;
$constants->{'URL_HOST'} = remove_www(URI->new($getOpts->{'URL'})->canonical->host );
$constants->{'TIMESTAMP'} = GetTimestamp('YMDHMS');
$constants->{'RUN_DIR'} = dirname(__FILE__) . '/run/' . $constants->{'TIMESTAMP'};
$constants->{'SITEMAP_EXT'} = '.xml.gz' if $getOpts->{'SITEMAP_COMPRESS'};
$constants->{'SITEMAP_EXT'} = '.xml' if !$getOpts->{'SITEMAP_COMPRESS'};
$constants->{'SITEMAP_FILE'} = $constants->{'RUN_DIR'} . '/sitemap' . $constants->{'SITEMAP_EXT'};
$constants->{'SITEMAP_INDEX_FILE'} = $constants->{'RUN_DIR'} . '/sitemap-index' . $constants->{'SITEMAP_EXT'};
$constants->{'OUTPUT_FILE'} = $constants->{'RUN_DIR'} . '/links';
}
#
# BibSonomy Methods
#
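# Page through each configured query in LIMIT/OFFSET chunks of
# LimitPerQuery rows until the database returns no more rows or the
# --link-limit is reached.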
sub getBibsonomyUrls {
&Delimiter((caller(0))[3], 4);
foreach my $key (keys %{$mySQLBibsonomy}) {
my $offset = 0;
printActionNrString("Querying $key......");
while ( my %results = %{ db_query(sprintf($mySQLBibsonomy->{$key}{"query"}, $mySQLOptions->{'LimitPerQuery'}, $offset), $connection, 'selectall_hashref', $mySQLBibsonomy->{$key}{"id_key"}) } ) {
return if is_link_limit();
$offset += $mySQLOptions->{'LimitPerQuery'};
my $valid = createBibsonomyUrls(\%results, $key);
push @{$data{'LINKS'}{200}}, @{$valid};
&logAssert(('-> ' . scalar @{$data{'LINKS'}{200}}), 2);
}
}
}
sub createBibsonomyUrls {
&Delimiter((caller(0))[3], 4);
my ($results_ref, $sqlKey) = @_;
my @valid;
my $prepend = "$data{'URL'}{'SITEMAP_ROOT'}/$mySQLBibsonomy->{$sqlKey}{'prepend'}";
foreach my $key ( keys %{$results_ref} ) {
my $link_string = "$prepend/$key";
$link_string .= "/$$results_ref{$key}{'user_name'}" if $sqlKey eq "publicationsUser";
push(@valid, $link_string) if &is_valid_link(URI->new($link_string));
}
return \@valid;
}
sub runCrawlerBibsonomy {
&Delimiter((caller(0))[3], 4);
printActionNrString("Pushing static links...");
@{$data{'LINKS'}{200}} = map { "$data{'URL'}{'SITEMAP_ROOT'}/$_" } (@{$config{'staticLinks'}});
&logAssert(('-> ' . scalar @{$data{'LINKS'}{200}}), 2);
getBibsonomyUrls();
}
#
# Sitemap Methods
#
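# Copy the generated sitemap files into the website root (when --sitemap-root
# is set) and ping each configured search engine with the public sitemap URL.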
sub submitSitemaps {
&Delimiter((caller(0))[3], 4);
my $sitemapUrls;
$sitemapUrls = copySitemaps() if $getOpts->{'SITEMAP_ROOT'};
return if !$getOpts->{'SITEMAP_SUBMIT'};
pingSearchEngine($sitemapUrls);
}
sub copySitemaps {
&Delimiter((caller(0))[3], 4);
my @sitemapUrls;
foreach my $i (0 .. scalar @{$data{'SITEMAPS'}}-1 ) {
my ($file,$dir,$ext) = fileparse($data{'SITEMAPS'}[$i], qr/\.[^.]*/);
my $new_location = $getOpts->{'SITEMAP_ROOT'} . '/' . $file . $ext;
my $sitemap_url = $data{'URL'}{'SITEMAP_ROOT'} . '/' . $file . $ext;
push( @sitemapUrls, $sitemap_url );
copy $data{'SITEMAPS'}[$i], $new_location or warn "Copy failed: $new_location $!";
}
return \@sitemapUrls;
}
sub pingSearchEngine {
&Delimiter((caller(0))[3], 4);
my $sitemapUrls_ref = shift;
my $uri = URI::Encode->new( { encode_reserved => 0 } );
for my $url (@{$sitemapUrls_ref}) {
for my $engine( keys %{$config{'PING'}} ) {
my $ping_url = $config{'PING'}{$engine}{'URL'} . $uri->encode($url);
&logAssert("\tSubmitting to $engine: $url -> $ping_url", 3);
my $code = &getStatusCode($ping_url);
&logAssert("\t$code", 2);
}
}
}
sub printActionNrString{
&Delimiter((caller(0))[3], 4);
my $message = shift;
return if $getOpts->{'VERBOSE_LEVEL'} < 1;
say "$actionNr) $message";
$actionNr++;
}
sub createSitemap {
&Delimiter((caller(0))[3], 4);
my ($index, $sitemap) = @_;
my ($file,$dir,$ext) = fileparse($constants->{'SITEMAP_FILE'}, qr/\.[^.]*/);
my $output_file = sprintf("%ssitemap_%d%s", $dir,$index, $constants->{'SITEMAP_EXT'});
printActionNrString("Writing $output_file...");
$sitemap->write($output_file);
push( @{$data{'SITEMAPS'}}, $output_file );
}
sub addToSitemapIndex {
&Delimiter((caller(0))[3], 4);
my ($index, $indexSitemap) = @_;
my $sitemap_file = sprintf("%s/sitemap_%d%s", $data{'URL'}{'SITEMAP_ROOT'}, $index, $constants->{'SITEMAP_EXT'});
$indexSitemap->add(Search::Sitemap::URL->new(loc => $sitemap_file));
}
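# Collect all valid links into sitemaps of at most --sitemap-limit URLs each;
# as soon as a split is needed, every chunk is written as sitemap_<n> and
# registered in a sitemap index.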
sub generateSitemaps {
&Delimiter((caller(0))[3], 4);
my $indexSitemap = Search::Sitemap::Index->new('pretty' => 1);
my $sitemap = Search::Sitemap->new(pretty => 1);
my ($i, $index) = (0, 0);
foreach my $url ( @{$data{'LINKS'}{'200'}} ) {
$sitemap->add(Search::Sitemap::URL->new(loc => $url));
$i++;
if( $i == $getOpts->{'SITEMAP_LIMIT'} ) {
$index++;
$i = 0;
createSitemap($index, $sitemap);
addToSitemapIndex($index, $indexSitemap);
$sitemap = Search::Sitemap->new(pretty => 1);
}
}
writeSitemapFiles($sitemap, $indexSitemap, $index, $i);
}
sub writeSitemapFiles {
&Delimiter((caller(0))[3], 4);
my ($sitemap, $indexSitemap, $index, $i) = @_;
if( $i < $getOpts->{'SITEMAP_LIMIT'} && $index == 0 ) {
$sitemap->write( $constants->{'SITEMAP_FILE'} );
push( @{$data{'SITEMAPS'}}, $constants->{'SITEMAP_FILE'} );
}elsif( $i < $getOpts->{'SITEMAP_LIMIT'} && $index > 0 && $i > 0) {
$index++;
createSitemap($index, $sitemap);
addToSitemapIndex($index, $indexSitemap);
createSitemapIndex($indexSitemap);
}elsif( $i < $getOpts->{'SITEMAP_LIMIT'} && $index > 0 && $i == 0) {
createSitemapIndex($indexSitemap);
}
}
sub createSitemapIndex {
&Delimiter((caller(0))[3], 4);
my $indexSitemap = shift;
printActionNrString("Writing $constants->{'SITEMAP_INDEX_FILE'}...");
$indexSitemap->write( $constants->{'SITEMAP_INDEX_FILE'} );
push( @{$data{'SITEMAPS'}}, $constants->{'SITEMAP_INDEX_FILE'} );
}
#
# URL validate Methods
#
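# Each enabled check from $config{'urlChecks'} must pass for a link to be
# accepted; the checks share one argument list (string, scheme, depth, host,
# status code) so they can be dispatched uniformly.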
sub is_valid_link {
&Delimiter((caller(0))[3], 4);
my ($link) = @_;
my $link_depth;
$link_depth = () = $link->path =~ /\/.+?/g if $getOpts->{'LINK_DEPTH'};
my $link_host = &remove_www($link->host);
&logAssert("-> $link", 3);
my $code;
$code = getStatusCode($link->canonical->as_string) if $getOpts->{'CHECK_RESPONSES'};
for my $key (keys %{$config{'urlChecks'}}) {
next if !$getOpts->{$key};
return 0 if !$config{'urlChecks'}{$key}->($link->canonical->as_string, $link->scheme, $link_depth, $link_host, $code);
}
return 1;
}
sub checkLinkUnique {
&Delimiter((caller(0))[3], 4);
my ($link_string, $link_protocol, $link_depth, $link_host) = @_;
if ( ++$uniq{$link_string} > 1 ) {
&logAssert("\tFAIL (UNIQ)", 3);
return 0;
}
return 1;
}
sub checkLinkDepth {
&Delimiter((caller(0))[3], 4);
my ($link_string, $link_protocol, $link_depth, $link_host) = @_;
if ($link_depth > $getOpts->{'LINK_DEPTH'}) {
&logAssert("\tFAIL (DEPTH $link_depth)", 3);
return 0;
}
return 1;
}
sub checkLinkRegex {
&Delimiter((caller(0))[3], 4);
my ($link_string, $link_protocol, $link_depth, $link_host) = @_;
if(!&check_multiple_regex($link_string) ) {
&logAssert("\tFAIL (REGEX)", 3);
return 0;
}
return 1;
}
sub getStatusCode {
&Delimiter((caller(0))[3], 4);
my ($link_string) = @_;
my $uaLWP = LWP::UserAgent->new;
my $response = $uaLWP->get($link_string);
return $response->code;
}
sub checkLinkResponse {
&Delimiter((caller(0))[3], 4);
my ($link_string, $link_protocol, $link_depth, $link_host, $code) = @_;
if (!isInArray($code, $getOpts->{'ALLOWED_RESPONSES'})) {
&logAssert("\tFAIL (Status-Code not allowed: $code)", 3);
return 0;
}
return 1;
}
#
# Finish Methods
#
sub calcMinMaxAvg {
return sprintf("URL lengths: min=%d, max=%d, avg=%d", get_min_max_avg() ) if $getOpts->{'VERBOSE_LEVEL'} >= 1;
}
sub calcRuntime {
return 'Runtime: ', &get_runtime($data{'STATISTICS'}{'TIME'}{'START'}, time()) if $getOpts->{'VERBOSE_LEVEL'} >= 1;
}
sub endBenchmark {
return if $getOpts->{'VERBOSE_LEVEL'} < 1;
my $t1 = Benchmark->new;
my $td = timediff($t1, $data{'STATISTICS'}{'BENCHMARK_START'});
return "Benchmark: ", timestr($td);
}
sub runCompleted {
return if $getOpts->{'VERBOSE_LEVEL'} < 1;
my $timestamp = &GetTimestamp('YMDHMS_r');
return "RUN $constants->{'TIMESTAMP'} completed @ $timestamp";
}
sub finishRun {
&Delimiter((caller(0))[3], 4);
$connection->disconnect();
outputLinks() if $getOpts->{'OUTPUT_LINKS'};
if ($getOpts->{'VERBOSE_LEVEL'} >= 1) {
say runCompleted();
&logAssert(('-' x 80), 1);
say calcMinMaxAvg();
say calcRuntime();
say endBenchmark();
}
printToFile("$constants->{'RUN_DIR'}/config", \%config) if $getOpts->{'OUTPUT_CONFIG'};
printToFile("$constants->{'RUN_DIR'}/data", \%data) if $getOpts->{'OUTPUT_DATA'};
}
sub get_min_max_avg {
&Delimiter((caller(0))[3], 4);
return (0, 0, 0) if !@{$data{'LINKS'}{200}};
my ($sum);
my @all_links;
for my $key( keys %{$data{'LINKS'}} ) {
@all_links = (@all_links, @{$data{'LINKS'}{$key}});
}
foreach my $i (0 .. $#all_links) {
$all_links[$i] = length($all_links[$i]);
$sum += $all_links[$i];
}
return (min(@all_links), max(@all_links), int($sum / @all_links));
}
sub get_checked_links {
&Delimiter((caller(0))[3], 4);
my $size = 0;
for my $key( keys %{$data{'LINKS'}} ) {
$size += scalar @{$data{'LINKS'}{$key}};
}
return $size;
}
#
# Helper methods
#
sub isInArray {
&Delimiter((caller(0))[3], 4);
my ($element, $arr_ref) = @_;
my %hash = map { $_ => 1 } @{$arr_ref};
return exists($hash{$element});
}
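# Open a MySQL handle via DBIx::Connector; mysql_auto_reconnect lets long
# crawls survive dropped connections.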
sub connectToMySql {
&Delimiter((caller(0))[3], 4);
my ($db) = @_;
my $connectionInfo = "dbi:mysql:database=$db;host=$mySQLAccess->{'hostname'};port=$mySQLAccess->{'port'}"; # assign config mysql values to a connection variable
my $l_connection = DBIx::Connector->new(
$connectionInfo,
$mySQLAccess->{'user'},
$mySQLAccess->{'pw'},
{
RaiseError => $mySQLOptions->{'RaiseError'}, # SUGGESTED BY AnyEvent::DBI::MySQL
AutoCommit => $mySQLOptions->{'AutoCommit'},
mysql_auto_reconnect => $mySQLOptions->{'Reconnect'},
}
);
return $l_connection->dbh;
}
sub startBenchmarking {
&Delimiter((caller(0))[3], 4);
$data{'STATISTICS'}{'BENCHMARK_START'} = Benchmark->new;
$data{'STATISTICS'}{'TIME'}{'START'} = time();
$data{'STATISTICS'}{'CONTENT_LENGTH'} = 0;
}
sub printScriptHeaders {
&Delimiter((caller(0))[3], 4);
return if $getOpts->{'VERBOSE_LEVEL'} < 1;
my $timestamp_r = GetTimestamp('YMDHMS_r');
say '=' x 80;
say "$license";
say '-' x 80;
say 'Sitemap Generator v1.0.0 by Marlon Maschkiwitz';
say 'E-Mail: zino@zinomedia.de';
say '=' x 80;
say 'Start checking @ ', $timestamp_r . "\n";
}
sub printToFile{
&Delimiter((caller(0))[3], 4);
my ($filepath, $ref) = @_;
# DATA DUMPER UTF8 HACK
no warnings 'redefine';
local *Data::Dumper::qquote = sub { qq["${\(shift)}"] };
local $Data::Dumper::Useperl = 1;
open my $FILE, '>:encoding(UTF-8)', $filepath or die "Cannot open '$filepath': $!";
print $FILE Dumper $ref;
close $FILE;
}
sub canCopy {
&Delimiter((caller(0))[3], 4);
return if !$getOpts->{'SITEMAP_SUBMIT'} && !$getOpts->{'SITEMAP_ROOT'};
my $tmpfile = &tempfile();
my $new_location = "$getOpts->{'SITEMAP_ROOT'}/tmp";
copy $tmpfile, $new_location or do {
unlink $tmpfile;
die "Cannot copy to $new_location: $!";
};
unlink($tmpfile, $new_location);
}
sub db_query {
&Delimiter((caller(0))[3], 4);
# $switch names the DBI fetch style for call compatibility; only
# selectall_hashref is implemented here, keyed by the id column in $rest[0].
my ($query, $connection, $switch, @rest) = @_;
my $response = $connection->selectall_hashref($query, $rest[0]) or die $connection->errstr;
return $response;
}
sub trim {
&Delimiter((caller(0))[3], 4);
my $s = shift;
return $s =~ s/^\s+|\s+$//gr;
}
sub is_link_limit {
&Delimiter((caller(0))[3], 4);
return if !$getOpts->{'LINK_LIMIT'};
if( exists $data{'LINKS'}{200} && scalar @{$data{'LINKS'}{200}} >= $getOpts->{'LINK_LIMIT'} ) {
&logAssert(('LINK_LIMIT of ' . $getOpts->{'LINK_LIMIT'} . ' reached'), 3);
$reached_link_limit = 1;
return 1;
}else {
return 0;
}
}
sub remove_www {
&Delimiter((caller(0))[3], 4);
my $str = shift;
return $str =~ s/^www\.//ir;
}
sub check_multiple_regex {
&Delimiter((caller(0))[3], 4);
my $str = shift;
# True if the link matches at least one of the compiled --link-regex patterns.
return (grep { $str =~ $_ } @{$getOpts->{'LINK_REGEX'}}) ? 1 : 0;
}
sub GetTimestamp {
&Delimiter((caller(0))[3], 4);
my $format = shift;
my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(time);
return sprintf( "%04d%02d%02d_%02d%02d%02d", $year+1900,$mon+1,$mday,$hour,$min,$sec ) if $format eq 'YMDHMS';
return sprintf( "%04d-%02d-%02d %02d:%02d:%02d", 1900 + $year, $mon + 1, $mday, $hour, $min, $sec ) if $format eq 'YMDHMS_r';
}
sub Delimiter {
my ($subName, $verboseLevel) = @_;
return if $verboseLevel > $getOpts->{'VERBOSE_LEVEL'};
say "\n" . "-" x 80;
print "> " . $subName;
say "\n" . '-' x 80;
}
sub get_runtime {
&Delimiter((caller(0))[3], 4);
my ($t1, $t2) = @_;
return strftime( "%T", gmtime( $t2-$t1 ) );
}
sub tempfile {
&Delimiter((caller(0))[3], 4);
return if !-e $constants->{'RUN_DIR'};
my $filename = "$constants->{'RUN_DIR'}/tmp";
open(my $fh, '>', $filename) or warn "Could not create tmp file '$filename' $!";
close($fh);
return $filename;
}
sub outputLinks {
return if (!$getOpts->{'OUTPUT_LINKS'});
for my $key( keys %{$data{'LINKS'}} ) {
&logAssert("Total links with status code $key found: " . scalar @{$data{'LINKS'}{$key}}, 1);
printToFilePlain("$constants->{'OUTPUT_FILE'}_$key", \@{$data{'LINKS'}{$key}});
}
}
sub printToFilePlain {
&Delimiter((caller(0))[3], 4);
my ($filepath, $ref) = @_;
open my $FILE, '>:encoding(UTF-8)', $filepath or die "Cannot open '$filepath': $!";
print $FILE join("\n", @{$ref});
close $FILE;
}
__END__
=pod
=encoding UTF-8
=head1 NAME
BibSonomy Dynamic Sitemap Generator
=head1 IMPORTANT USAGE GUIDELINES
To run the script with the default options, invoke it without arguments.
Some routines may require root level privileges.
=head1 SYNOPSIS
usage: script [--url <string>] [--link-limit <int>] [--link-depth <int>] [--sitemap-limit <int>] [--link-regex <string>] [--sitemap-root <string>] [--verbose-level <int>] [--responses <int>] [--nocompress] [--output-links] [--nocheck-unique] [--check-depth] [--check-responses] [--sitemap-submit] [--output-config] [--output-data]
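
A typical invocation (all values below are illustrative) might look like:

 perl mojo-useragent-bibsonomy.pl --url https://www.bibsonomy.org --verbose-level 2 --output-links

 perl mojo-useragent-bibsonomy.pl --url https://www.bibsonomy.org --sitemap-root /var/www/bibsonomy --sitemap-submit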
=head1 OPTION ARGUMENTS
--url <string> | Domain, including protocol, that is prepended to every generated URL
--link-limit <int> | Maximum number of generated URLs. Note that the actual number can exceed the limit by up to ~10,000 URLs due to the database query offset.
--link-depth <int> | Maximum depth of generated URLs. Only works alongside --check-depth. Default: 3
--sitemap-limit <int> | Limit at which the sitemap is split into multiple sitemaps. Once a split occurs, a sitemap index covering all single sitemaps is also generated. Default: 50000
--link-regex <string> | If provided, links must match the regex to count as valid. Use multiple times to provide more than one (see the example after this list). Default: Disabled
--sitemap-root <string> | Location of the website root directory, to which the sitemaps will be moved. Only works alongside --sitemap-submit. Default: Disabled
--verbose-level <int> | Specifies the verbosity level of the STDOUT messages
--responses <int> | Adds response codes that count a link as valid. Use multiple times to provide more than one. Only works alongside --check-responses. Default: 200
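
For example, to keep only publication and user pages (the patterns are illustrative):

 perl mojo-useragent-bibsonomy.pl --link-regex '/publication/' --link-regex '/user/'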
=head1 FLAG ARGUMENTS
--nocompress | Sitemaps will not be compressed as .gz files. Default: Compressed
--output-links | Determines whether plain files with URLs will be generated. Each status code generates a separate file with the corresponding links. Default: Disabled
--nocheck-unique | Allows duplicate URLs in the sitemaps.
--check-depth | Check the depth of the URLs. The maximum depth can be overridden via --link-depth.
--check-responses | Check the status code of each generated URL. Permitted status codes can be overridden via --responses
--sitemap-submit | Determines whether sitemap files will be submitted to search engines. Sitemap files need to be moved to the website root directory, so this only works alongside --sitemap-root.
--output-config | Write the config hash to the file "config" in the current run dir
--output-data | Write the data hash to the file "data" in the current run dir
=head1 DESCRIPTION
The Sitemap Protocol allows you to inform search engine crawlers about URLs on your website that are available for crawling.
A sitemap consists of a list of URLs and may contain additional information about those URLs, such as when they were last modified and how frequently they change.
This script dynamically generates URLs from the BibSonomy database and creates, modifies, and submits sitemaps to multiple search engines.
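
As a minimal sketch of the Search::Sitemap usage this script builds on (the file name and URL below are illustrative):

 use Search::Sitemap;
 use Search::Sitemap::URL;

 my $sitemap = Search::Sitemap->new( pretty => 1 );
 $sitemap->add( Search::Sitemap::URL->new( loc => 'https://www.bibsonomy.org/tags' ) );
 # a name ending in .gz is written gzip-compressed, as this script relies on
 $sitemap->write('sitemap.xml.gz');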
=head1 REQUIRED MODULES
L<Data::Validate::URI>
L<File::Path>
L<List::Util>
L<File::Copy>
L<Data::Dumper>
L<URI>
L<URI::Encode>
L<Getopt::Long>
L<File::Basename>
L<Search::Sitemap::Index>
L<LWP::UserAgent>
L<DBIx::Connector>
L<Search::Sitemap>
=head1 AUTHOR
Marlon Maschkiwitz - zino@zinomedia.de
=head1 COPYRIGHT AND LICENSE
MIT License
Copyright (c) 2021 Marlon Maschkiwitz
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
=cut