I have little scraping application and trying to add multithreading to it. Here is code (MyMech is WWW::Mechanize subclass used to process HTTP errors):
#!/usr/bin/perl
use strict;
use MyMech;
use File::Basename;
use File::Path;
use HTML::Entities;
use threads;
use threads::shared;
use Thread::Queue;
use List::Util qw( max sum );
my $page = 1;
my %CONFIG = read_config();
my $mech = MyMech->new( autocheck => 1 );
$mech->quiet(0);
$mech->get( $CONFIG{BASE_URL} . "/site-map.php" );
my #championship_links =
$mech->find_all_links( url_regex => qr/\d{4}-\d{4}\/$/ );
foreach my $championship_link (#championship_links) {
my #threads;
my $queue = Thread::Queue->new;
my $queue_processed = Thread::Queue->new;
my $url = sprintf $championship_link->url_abs();
print $url, "\n";
next unless $url =~ m{soccer}i;
$mech->get($url);
my ( $last_round_loaded, $current_round ) =
find_current_round( $mech->content() );
unless ($last_round_loaded) {
print "\tLoading rounds data...\n";
$mech->submit_form(
form_id => "leagueForm",
fields => {
round => $current_round,
},
);
}
my #match_links =
$mech->find_all_links( url_regex => qr/matchdetails\.php\?matchid=\d+$/ );
foreach my $link (#match_links) {
$queue->enqueue($link);
}
print "Starting printing thread...\n";
my $printing_thread = threads->create(
sub { printing_thread( scalar(#match_links), $queue_processed ) } )
->detach;
push #threads, $printing_thread;
print "Starting threads...\n";
foreach my $thread_id ( 1 .. $CONFIG{NUMBER_OF_THREADS} ) {
my $thread = threads->create(
sub { scrape_match( $thread_id, $queue, $queue_processed ) } )
->join;
push #threads, $thread;
}
undef $queue;
undef $queue_processed;
foreach my $thread ( threads->list() ) {
if ( $thread->is_running() ) {
print $thread->tid(), "\n";
}
}
#sleep 5;
}
print "Finished!\n";
sub printing_thread {
my ( $number_of_matches, $queue_processed ) = #_;
my #fields =
qw (
championship
year
receiving_team
visiting_team
score
average_home
average_draw
average_away
max_home
max_draw
max_away
date
url
);
while ($number_of_matches) {
if ( my $match = $queue_processed->dequeue_nb ) {
open my $fh, ">>:encoding(UTF-8)", $CONFIG{RESULT_FILE} or die $!;
print $fh join( "\t", #{$match}{#fields} ), "\n";
close $fh;
$number_of_matches--;
}
}
threads->exit();
}
sub scrape_match {
my ( $thread_id, $queue, $queue_processed ) = #_;
while ( my $match_link = $queue->dequeue_nb ) {
my $url = sprintf $match_link->url_abs();
print "\t$url", "\n";
my $mech = MyMech->new( autocheck => 1 );
$mech->quiet(0);
$mech->get($url);
my $match = parse_match( $mech->content() );
$match->{url} = $url;
$queue_processed->enqueue($match);
}
return 1;
}
And i have some strange things with this code. Sometimes it run but sometimes it exit with no errors (at the ->detach point). I know that #match_links contain data but threads are not created and it just close. Usually it terminates after processing second $championship_link entry.
May be i'm doing something wrong?
Update
Here is code for find_current_round subroutine (but i'm sure it's not related to the question):
sub find_current_round {
my ($html) = #_;
my ($select_html) = $html =~ m{
<select\s+name="round"[^>]+>\s*
(.+?)
</select>
}isx;
my ( $option_html, $current_round ) = $select_html =~ m{
(<option\s+value="\d+"(?:\s+ selected="selected")?>(\d+)</option>)\Z
}isx;
my ($last_round_loaded) = $option_html =~ m{selected};
return ( $last_round_loaded, $current_round );
}
First off - don't use dequeue_nb(). This is a bad idea, because if a queue is temporarily empty, it'll return undef and your thread will exit.
Use instead dequeue and and end. dequeue will block, but once you end your queue, the while will exit.
You're also doing some decidedly odd things with your threads - I would suggest that you rarely want to detach a thread. You're just assuming your thread is going to complete before your program, which isn't a good plan.
Likewise this;
my $thread = threads->create(
sub { scrape_match( $thread_id, $queue, $queue_processed ) } )
->join;
You're spawning a thread, and then instantly joining it. And so that join call will... block waiting for your thread to exit. You don't need threads at all to do that...
You also scope your queues within your foreach loop. I don't think that's a good plan. I would suggest instead - scope them externally, and spawn a defined number of 'worker' threads (and one 'printing' thread).
And then just feed them through the queue mechanism. Otherwise you'll end up creating multiple queue instances, because they're lexically scoped.
And once you've finished queuing stuff, issue a $queue -> end which'll terminate the while loop.
You also don't need to give a thread a $thread_id because ... they already have one. Try: threads -> self -> tid(); instead.
Related
i am trying to pass a subroutine from an self written module to threads using the following code.
This is my first time using threads so I'm kinda not familiar with it.
Main Script (shortend)
#!/usr/bin/perl -w
use strict;
use threads;
use lib 'PATH TO LIB';
use goldstandard;
my $delete_raw_files = 0;
my $outfolder = /PATH/;
my %folder = goldstandard -> create_folder($outfolder,$delete_raw_files);
&tagging if $tagging == 1;
sub tagging{
my %hash = goldstandard -> tagging_hash(\%folder);
my #threads;
foreach(keys %hash){
if($_ =~ m/mate/){
my $arguments = "goldstandard -> mate_tagging($hash{$_}{raw},$hash{$_}{temp},$hash{$_}{tagged},$mate_anna,$mate_model)";
push(#threads,$arguments);
}
if($_ =~ m/morpheus/){
my $arguments = "goldstandard -> morpheus_tagging($hash{$_}{source},$hash{$_}{tagged},$morpheus_stemlib,$morpheus_cruncher)";
push(#threads,$arguments)
}
}
foreach(#threads){
my $thread = threads->create($_);
$thread ->join();
}
}
Module
package goldstandard;
use strict;
use warnings;
sub mate_tagging{
my $Referenz = shift;
my $input = shift;
my $output_temp_dir = shift;
my $output_mate_human = shift;
my $anna = shift;
my $model = shift;
opendir(DIR,"$input");
my #dir = readdir(DIR);
my $anzahl = #dir;
foreach(#dir){
unless($_ =~ m/^\./){
my $name = $_;
my $path = $input . $_;
my $out_temp = $output_temp_dir . $name;
my $out_mate_human_final = $output_mate_human . $name;
qx(java -Xmx10G -classpath $anna is2.tag.Tagger -model $model -test $path -out $out_temp);
open(OUT, "> $out_mate_human_final");
open(TEMP, "< $out_temp");
my $output_text;
while(<TEMP>){
unless($_ =~ m/^\s+$/){
if ($_ =~ m/^\d+\t(.*?)\t_\t_\t_\t(.*?)\t_\t/) {
my $tags = $2;
my $words = $1;
print OUT "$words\t$tags\n";
}
}
}
}
}
}
sub morpheus_tagging{
my $Referenz = shift;
my $input = shift;
my $output = shift;
my $stemlib = shift;
my $cruncher = shift;
opendir(DIR,"$input");
my #dir = readdir(DIR);
foreach(#dir){
unless($_ =~ m/^\./){
my $name = $_;
my $path = $input . $_;
my $out = $output . $name;
qx(env MORPHLIB='$stemlib' '$cruncher' < '$path' > '$out');
}
}
}
1;
Executing this code gets me
Thread 1 terminated abnormally: Undefined subroutine &main::goldstandard -> morpheus_tagging(...) called at ... line 43.
I guess eather the way I am calling the treads or the way I am providing the arguments are wrong. I Hope some can help me with that? I Also found something on safe and unsafe modules bum I'm not sure is this is realy the problem.
I guess eather the way I am calling the treads or the way I am providing the arguments are wrong. I Hope some can help me with that? I Also found something on safe and unsafe modules bum I'm not sure is this is realy the problem.Thanks in advance
You must pass the name of a sub or a reference to a sub, plus arguments, to threads->create. So you need something like
my $method_ref = $invoker->can($method_name);
threads->create($method_ref, $invoker, #args);
That said, passing arguments to threads->create has issues that can be avoided by using a closure.
threads->create(sub { $invoker->$method_name(#args) })
The above can be written more simply as follows:
async { $invoker->$method_name(#args) }
This gets us the following:
sub tagging {
my %hash = goldstandard->tagging_hash(\%folder);
my #jobs;
for (keys %hash) {
if (/mate/) {
push #jobs, [ 'goldstandard', 'mate_tagging',
$hash{$_}{raw},
$hash{$_}{temp},
$hash{$_}{tagged},
$mate_anna,
$mate_model,
];
}
if (/morpheus/) {
push #jobs, [ 'goldstandard', 'morpheus_tagging',
$hash{$_}{source},
$hash{$_}{tagged},
$morpheus_stemlib,
$morpheus_cruncher,
];
}
}
my #threads;
for my $job (#jobs) {
my ($invoker, $method_name, #args) = #$job;
push #threads, async { $invoker->$method_name(#args) };
}
$_->join for #threads;
}
or just
sub tagging {
my %hash = goldstandard->tagging_hash(\%folder);
my #threads;
for (keys %hash) {
if (/mate/) {
push #threads, async {
goldstandard->mate_tagging(
$hash{$_}{raw},
$hash{$_}{temp},
$hash{$_}{tagged},
$mate_anna,
$mate_model,
);
};
}
if (/morpheus/) {
push #threads, async {
goldstandard->morpheus_tagging(
$hash{$_}{source},
$hash{$_}{tagged},
$morpheus_stemlib,
$morpheus_cruncher,
);
};
}
}
$_->join for #threads;
}
Notes that I delayed the calls to join until after all the threads are created. Your way made it so only one thread would run at a time.
But what we have isn't great. We have no way of limiting how many threads are active at a time, and we (expensively) create many threads instead of reusing them. We can use a worker pool to solve both of these problems.
use constant NUM_WORKERS => 5;
use Thread::Queue 3.01 qw( );
my $q;
sub tagging {
my %hash = goldstandard->tagging_hash(\%folder);
my #threads;
for (keys %hash) {
if (/mate/) {
$q->enqueue(sub {
goldstandard->mate_tagging(
$hash{$_}{raw},
$hash{$_}{temp},
$hash{$_}{tagged},
$mate_anna,
$mate_model,
);
});
}
if (/morpheus/) {
$q->enqueue(sub {
goldstandard->morpheus_tagging(
$hash{$_}{source},
$hash{$_}{tagged},
$morpheus_stemlib,
$morpheus_cruncher,
);
});
}
}
}
{
$q = Thread::Queue->new();
for (1..NUM_WORKERS) {
async {
while ( my $job = $q->dequeue() ) {
$job->();
}
};
}
... call tagging and whatever ...
$q->end();
$_->join() for threads->list();
}
I have made a Perl script where I create threads (limited in terms of threads running in the meantime) and each threads create its own children which should be also limited in number.
Where I host my script, I cannot launch more than X threads per Perl script in the meantime. In the below example, I have X = 3 x 7 = 21 threads maximum in the meantime.
3 for the 1st job ($nb_process_first)
7 for the 2nd job ($nb_process_second)
Questions:
Is there a better way to manage threads and their children? (queues for example - could you please bring me some code example because I have tried with no success)
My current script is not terminating with all the threads joined, although I use a loop on all running threads to join them (cf. at the end of the script).
#!/usr/bin/perl -s
use threads;
my #threads;
my $nb_process_first = 3;
my #running = ();
print "START" . "\n";
$current = 1;
while ( $current <= 10 ) {
#running = threads->list(threads::running);
if ( scalar #running < $nb_process_first ) {
print "Launch firstJob=" . scalar #running . "\n";
my $thread = threads->create( \&firstJob );
push( #threads, $thread );
} else {
redo;
}
$current++;
}
my #joinable = threads->list(threads::joinable);
while ( scalar #joinable != 0 ) {
foreach my $thr ( threads->list() ) {
$thr->join();
}
#joinable = threads->list(threads::joinable);
}
print "END" . "\n";
sub secondJob {
for ( $i = 0; $i <= 15; $i++ ) {
print "secondJob=" . $i . "\n";
sleep 1;
}
threads->exit();
}
sub firstJob {
my $nb_process_second = 7;
my #running = ();
$current = 1;
while ( $current <= 10 ) {
#running = threads->list(threads::running);
if ( scalar #running < $nb_process_second ) {
print "firstJob/Launch secondJob=" . scalar #running . "-" . $current . "\n";
my $secondthread = threads->create( \&secondJob );
push( #threads, $secondthread );
sleep 2;
}
$current++;
}
threads->exit();
}
Thread::Queue is a handy model for basic 'worker thread' model of threaded code.
It goes a bit like this:
#!/usr/bin/perl
use strict;
use warnings;
use threads;
use Thread::Queue;
my $firstworkitem_q = Thread::Queue -> new();
my $secondworkitem_q = Thread::Queue -> new();
my $nthreads = 10;
sub first_worker {
while ( my $item = $firstworkitem_q -> dequeue() ) {
print "First worker picked up $item, and queues it to second worker\n";
$secondworkitem_q -> enqueue ( $item );
}
}
sub second_worker {
while ( my $item = $secondworkitem_q -> dequeue() ) {
print "Second worker got $item";
}
}
my #first_workers;
for ( 1..$nthreads ) {
my $thr = threads -> create ( \&first_worker );
push ( #first_workers, $thr );
}
for ( 1..$nthreads ) {
my $thr = threads -> create ( \&second_worker );
}
$firstworkitem_q -> enqueue ( #things_to_processs );
$firstworkitem_q -> end;
foreach my $firstworker ( #first_workers ) {
$firstworker -> join();
}
#here all the first workers have finished, so we know nothing will be queued to second work queue.
$secondworkitem_q -> end();
foreach my $thr ( threads -> list() ) {
$thr -> join();
}
You stuff things into the queue, and iterate on it for processing. When you end the queue, the while loop gets an undef and thus terminates - making your thread joinable.
You don't need to track #running the way you do, because threads -> list() will do that. And more importantly - you'd need to make #running a shared variable and lock it, because otherwise you've got a different copy of it in each thread.
Having firstJob spawn secondJob I'd steer away from, because it can create all manner of fruity bugs. I'd suggest spawning two classes of worker thread. Use $queue -> end() to trigger the first set of workers to close.
As for your second question, threads are only joinable if they have finished running (see this answer). Because some of the threads aren't done when the second while loop runs, it finishes without joining them.
Your loop should wait based on the number of active threads, not the number of joinable threads. Something like this:
while (threads->list() > 0)
{
foreach my $joinable (threads->list(threads::joinable))
{
$joinable->join();
}
}
As for the first question, there are certainly other ways to manage threads. However, it is not possible to say what you should do without knowing your task.
I have written the perl script to pause and resume.When the user enters Ctrl+c it has to pause and on pressing c it should resume. But is not working properly as expected. Can anyone help me on this what mistake i am making:
use strict;
use threads;
use threads::shared;
use Thread::Suspend;
use Lens;
$SIG{'INT'} = 'Pause';
#$| = 1;
print chr(7);
my $nthreads = 64;
my #thrs;
for(1..$nthreads)
{
print "START $_ \n";
my ($thr) = threads->create(\&worker, $_);
push #thrs ,$thr;
}
$_->join for #thrs;
exit;
sub worker
{
my $id = shift;
my $tmp;
my $lens = Lens->new("172.16.1.65:2000");
die "cannot create object" unless defined $lens;
die "cannot connect to XRay at " unless defined $lens->open("172.16.1.65:2000");
for(1..100000)
{
print "Thread $id \n";
}
print "$id>LOAD EXIT\n";
}
sub Pause
{
sleep(1);
print "\nCaught ^C\n";
print "Press \"c\" to continue, \"e\" to exit: ";
$_->suspend() for #thrs;
while (1)
{
my $input = lc(getc());
chomp ($input);
if ($input eq 'c') {
#clock($hour,$min,$sec);
$_->resume() for #thrs;
return;
}
elsif ($input eq 'e') {
exit 1;
}
}
}
Well, you haven't been too specific as to how it's "not working properly". But I would suggest looking at using Thread::Semaphore for a 'suspend' mechanism.
I would also suggest not using signal and instead doing something like:
#!/usr/bin/perl
use strict;
use warnings;
use threads;
use Thread::Semaphore;
use Term::ReadKey;
my $nthreads = 64;
my $thread_semaphore = Thread::Semaphore->new($nthreads);
sub worker {
for ( 1 .. 10 ) {
$thread_semaphore->down();
print threads->self->tid(), "\n";
sleep 1;
$thread_semaphore->up();
}
}
for ( 1 .. $nthreads ) {
threads->create( \&worker );
}
my $keypress;
ReadMode 4;
while ( threads->list(threads::running) ) {
while ( not defined( $keypress = ReadKey(-1) )
and threads->list(threads::running) )
{
print "Waiting\nRunning:". threads->list(threads::running) . "\n";
sleep 1;
}
print "Got $keypress\n";
if ( $keypress eq "p" ) {
print "Pausing...";
$thread_semaphore -> down_force($nthreads);
print "All paused\n";
}
if ( $keypress eq "c" ) {
print "Resuming...";
$thread_semaphore -> up ( $nthreads );
}
}
ReadMode 0;
foreach my $thr ( threads->list ) {
$thr->join();
}
It'll 'suspend' by setting the semaphores to zero (or negative) and relies on the threads checking if they should be stopping here or not.
I think the root of your problem though, will probably be signal propagation - your signal handler is global across your threads. You might find configuring $SIG{'INT'} for your threads separately will yield better results. (E.g. set the signal handler to 'IGNORE' at the start of your code, and set specific ones in the thread/main once the threads have been spawned).
I have an array which contains a list of file #arr=(a.txt,b.txt,c.txt);
I am iterating the array and processing the files with foreach loop; each line of the file will generate a sql and will run on the DB server.
I want to create one thread with each line of the file and query the DB. I also want to control the max no of threads at a time running simultaneously.
You can use a Thread::Pool based system. Or any Boss/Worker model based system.
That's just a simple worker model, an ideal scenario. No problem.
use threads;
use Thread::Queue qw( );
use constant NUM_WORKERS => 5;
sub work {
my ($dbh, $job) = #_;
...
}
{
my $q = Thread::Queue->new();
my #threads;
for (1..NUM_WORKERS) {
push #threads, async {
my $dbh = ...;
while (my $job = $q->dequeue())
work($dbh, $job);
}
};
}
while (<>) {
chomp;
$q->enqueue($_);
}
$q->enqueue(undef) for 1..#threads;
$_->join() for #threads;
}
Pass the file names to the script as arguments, or assign them to #ARGV within the script.
local #ARGV = qw( a.txt b.txt c.txt );
Interesting I manually control how many threads to run. I use Hash of the thread id
[code snip]
my %thr; #my hashes for threads
$count=1;
$maxthreads=5;
while (shift (#data) {
$syncthr = threads->create(sub {callfunction here}, {pass variables});
$tid = $syncthr->tid; #get the thread ID
$thr{$tid} = $syncthr;
if ($count >= $maxthreads) {
threads->yield();
while (1) { # loop until threads are completed
$num_run_threads = keys (%thr);
foreach $one_thread ( keys %thr ) {
if ($thr{$one_thread}->is_running() ) { # if thread running check for error state
if ($err = $thr{$one_thread}->error() } {
[ do stuff here]
}
# otherwise move on to next thread check
} else { # thread is either completed or has error
if ($err = $thr{$one_thread}->error()) {
[ check for error again cann't hurt to double check ]
}
if ($err = $thr{$one_thread}->join()) {
print "Thread completed id: [$one_thread]\n";
}
delete $thr{$one_thread}; # delete the hash since the thread is no more
$num_run_threads = $num_run_threads - 1; # reduce the number of running threads
}
} # close foreach loop
#threads = threads->list(threads::running); # get threads
if ($num_run_threads < $maxthreads ) {
$count = $num_run_threads; # reset the counter to number of threads running
if ( $#data != -1 ) { # check to make sure we still have data
last; # exit the infinite while loop
} else {
if (#threads) {
next; # we still have threads continue with processing
} else {
{ no more threads to process exit program or do something else }
}
} # end else
} # end threads running
} # end the while statement
#Check the threads to see if they are joinable
undef #threads;
#threads = threads->joinable()
if (#threads) {
foreach $mthread(#threads) {
if ($mthreads != 0) {
$thr->join();
}
} #end foreach
} #end #threads
} #end the if statement
$count++; Increment the counter to get to number of max threads to spawn
}
This is by no means a complete program. Furthermore, I have changed it to be very bland. However, I've been using this for a while with success. Especially in the OO Perl. This works for me and have quite a lot of uses. I maybe missing a few more error checking especially with timeout but I do that in the thread itself. Which by the way the thread is actually a sub routine that I am calling.
I'm working on a project, implemented in Perl, and thought it would be an idea to use threads to distribute the work, because the tasks can be done independent of each other and only reading from shared data in memory. However, the performance is nowhere near as I expect it to be. So after some investigation I can only conclude that threads in Perl basically suck, but I keep wondering the performance goes down the drain as soon as I implement one single shared variable.
For example, this little program has nothing shared and consumes 75% of the CPU (as expected):
use threads;
sub fib {
my ( $n ) = #_;
if ( $n < 2 ) {
return $n;
} else {
return fib( $n - 1 ) + fib( $n - 2 );
}
}
my $thr1 = threads->create( 'fib', 35 );
my $thr2 = threads->create( 'fib', 35 );
my $thr3 = threads->create( 'fib', 35 );
$thr1->join;
$thr2->join;
$thr3->join;
And as soon as I introduce a shared variable $a, the CPU usage is somewhere between 40% and 50%:
use threads;
use threads::shared;
my $a : shared;
$a = 1000;
sub fib {
my ( $n ) = #_;
if ( $n < 2 ) {
return $n;
} else {
return $a + fib( $n - 1 ) + fib( $n - 2 ); # <-- $a was added here
}
}
my $thr1 = threads->create( 'fib', 35 );
my $thr2 = threads->create( 'fib', 35 );
my $thr3 = threads->create( 'fib', 35 );
$thr1->join;
$thr2->join;
$thr3->join;
So $a is read-only and no locking takes place, and yet the performance decreases. I'm curious why this happens.
At the moment I'm using Perl 5.10.1 under Cygwin on Windows XP. Unfortunately I couldn't test this on a non-Windows machine with a (hopefully) more recent Perl.
Your code is a tight loop around a synchronized structure. Optimize it by having each thread copy the shared variable -- just once for each thread -- into an unshared variable.
Constructing a shared object containing lots of data is possible in Perl and not worry about extra copies. There is no impact to performance when spawning workers, because the shared data resides inside a separate thread or process, depending on whether using threads.
use MCE::Hobo; # use threads okay or parallel module of your choice
use MCE::Shared;
# The module option constructs the object under the shared-manager.
# There's no trace of data inside the main process. The construction
# returns a shared reference containing an id and class name.
my $data = MCE::Shared->share( { module => 'My::Data' } );
my $b;
sub fib {
my ( $n ) = #_;
if ( $n < 2 ) {
return $n;
} else {
return $b + fib( $n - 1 ) + fib( $n - 2 );
}
}
my #thrs;
push #thrs, MCE::Hobo->create( sub { $b = $data->get_keys(1000), fib(35) } );
push #thrs, MCE::Hobo->create( sub { $b = $data->get_keys(2000), fib(35) } );
push #thrs, MCE::Hobo->create( sub { $b = $data->get_keys(3000), fib(35) } );
$_->join() for #thrs;
exit;
# Populate $self with data. When shared, the data resides under the
# shared-manager thread (via threads->create) or process (via fork).
package My::Data;
sub new {
my $class = shift;
my %self;
%self = map { $_ => $_ } 1000 .. 5000;
bless \%self, $class;
}
# Add any getter methods to suit the application. Supporting multiple
# keys helps reduce the number of trips via IPC. Serialization is
# handled automatically if getter method were to return a hash ref.
# MCE::Shared will use Serial::{Encode,Decode} if available - faster.
sub get_keys {
my $self = shift;
if ( wantarray ) {
return map { $_ => $self->{$_} } #_;
} else {
return $self->{$_[0]};
}
}
1;