([^<]+)<\/title>/i; $title //= 'No title'; say "Title: " . decode_entities($title); # Extract all links my @links = $html =~ /<a\s+(?:[^>]*?\s+)?href=["']([^"']+)["']/gi; say "\nFound " . scalar(@links) . " links:"; # Process and display unique links my %seen; for my $link (@links) { next if $seen{$link}++; next if $link =~ /^#/; # Skip anchors # Make relative URLs absolute if ($link !~ /^https?:\/\//) { if ($link =~ /^\//) { # Absolute path my ($base) = $url =~ /(https?:\/\/[^\/]+)/; $link = "$base$link"; } else { # Relative path my ($base) = $url =~ /(.*\/)/; $link = "$base$link"; } } say " - $link"; } # Extract meta tags say "\nMeta tags:"; while ($html =~ /<meta\s+([^>]+)>/gi) { my $meta = $1; my ($name) = $meta =~ /name=["']([^"']+)["']/i; my ($content) = $meta =~ /content=["']([^"']+)["']/i; if ($name && $content) { say " $name: " . decode_entities($content); } } # Extract all email addresses (naive pattern) my @emails = $html =~ /\b([\w\.\-]+@[\w\.\-]+\.\w+)\b/g; if (@emails) { say "\nEmail addresses found:"; my %unique_emails; @unique_emails{@emails} = (); say " - $_" for sort keys %unique_emails; } ``` ## Debugging Regular Expressions ### The use re 'debug' Pragma ```perl use re 'debug'; "test string" =~ /test.*string/; # This will output the regex compilation and execution process # Great for understanding why a regex isn't matching ``` ### Building Regexes Incrementally ```perl # Start simple and build up my $regex = qr/\d+/; # Match numbers $regex = qr/\d+\.\d+/; # Match decimals $regex = qr/\d+(?:\.\d+)?/; # Optional decimal part $regex = qr/^\d+(?:\.\d+)?$/; # Anchor to whole string # Test at each stage my @test_cases = qw(123 123.45 .45 123. abc); for my $test (@test_cases) { if ($test =~ $regex) { say "$test matches"; } else { say "$test doesn't match"; } } ``` ## Common Gotchas and Solutions ### The Greediness Problem ```perl my $xml = '<tag>content</tag><tag>more</tag>'; # Wrong: Greedy matching $xml =~ /<tag>.*<\/tag>/; # Matches entire string! 
# Right: Non-greedy $xml =~ /<tag>.*?<\/tag>/; # Matches first tag pair # Better: Explicit $xml =~ /<tag>[^<]*<\/tag>/; # Most efficient ``` ### The Anchor Trap ```perl # Dangerous: No anchors if ($input =~ /\d{3}/) { # Matches "abc123def" - probably not intended! } # Safe: With anchors if ($input =~ /^\d{3}$/) { # Only matches exactly 3 digits } ``` ### Special Characters in Variables ```perl my $user_input = "What???"; # Wrong: ? is a regex metacharacter if ($text =~ /$user_input/) { # Error! # Right: Quote metacharacters if ($text =~ /\Q$user_input\E/) { # Treats ??? as literal ``` ## Best Practices 1. **Comment complex regexes** - Use /x modifier liberally 2. **Name your captures** - $+{name} is clearer than $3 3. **Compile once when possible** - Use qr// for repeated patterns 4. **Test incrementally** - Build complex patterns step by step 5. **Consider alternatives** - Sometimes a parser is better than a regex 6. **Anchor when appropriate** - Prevent unexpected matches 7. **Be careful with user input** - Always use \Q...\E for literal matching ## The Zen of Perl Regexes Regular expressions in Perl aren't just a feature—they're a philosophy. They embody Perl's core principle: make easy things easy and hard things possible. Yes, you can write unreadable regex golf. But you can also write clear, maintainable patterns that solve real problems elegantly. The key is knowing when to use them. Not every text processing task needs a regex. But when you do need one, Perl ensures you have the full power of regular expressions at your fingertips, integrated seamlessly into the language. --- *Next up: File I/O and directory operations. We'll see how Perl's "Do What I Mean" philosophy extends to file handling, and why Perl remains a favorite for system administrators who need to process thousands of files efficiently.*

(.*?)<\/div>/s; # Captures across newlines # /m - Multi-line mode (^ and $ match line boundaries) my $multi = "Line 1\nLine 2\nLine 3"; my @lines = $multi =~ /^Line \d+$/gm; # /g - Global matching my $data = "cat bat rat"; my @words = $data =~ /\w+/g; # ('cat', 'bat', 'rat') # /o - Compile pattern once (optimization for loops) for my $line (@huge_file) { $line =~ /$pattern/o; # Pattern compiled only once } ``` ## Advanced Pattern Matching ### Non-Capturing Groups ```perl # (?:...) doesn't create a capture variable my $url = "https://www.example.com:8080/path"; if ($url =~ /^(https?):\/\/(?:www\.)?([^:\/]+)(?::(\d+))?/) { my ($protocol, $domain, $port) = ($1, $2, $3); $port //= $protocol eq 'https' ? 443 : 80; say "Protocol: $protocol, Domain: $domain, Port: $port"; } ``` ### Named Captures (Perl 5.10+) ```perl # Much more readable than $1, $2, $3... my $log_line = '2024-01-15 10:30:45 [ERROR] Connection timeout'; if ($log_line =~ / (?<date>\d{4}-\d{2}-\d{2})\s+ (?<time>\d{2}:\d{2}:\d{2})\s+ \[(?<level>\w+)\]\s+ (?<message>.+) /x) { say "Date: $+{date}"; say "Time: $+{time}"; say "Level: $+{level}"; say "Message: $+{message}"; } ``` ### Lookahead and Lookbehind ```perl # Positive lookahead (?=...) # Match 'test' only if followed by 'ing' "testing tested" =~ /test(?=ing)/; # Matches 'test' in 'testing' # Negative lookahead (?!...) # Match 'test' only if NOT followed by 'ing' "testing tested" =~ /test(?!ing)/; # Matches 'test' in 'tested' # Positive lookbehind (?<=...) # Match numbers preceded by '$' "Price: $50, €50" =~ /(?<=\$)\d+/; # Matches '50' after '$' # Negative lookbehind (?<!...) # Match numbers NOT preceded by '$' 'Price: $50, €50' =~ /(?<!\$)\b\d+/; # Matches '50' after '€' # Lookbehind combined with a named capture my $amount = 'Total: $1,234.56'; if ($amount =~ /(?<=\$)(?<price>[\d,]+\.?\d*)/) { my $price = $+{price}; $price =~ s/,//g; # Remove commas say "Price: $price"; # Price: 1234.56 } ``` ### Recursive Patterns Perl can match nested structures: ```perl # Match balanced parentheses my $balanced = qr/ \( # Opening paren (?: [^()]+ # Non-parens | (?R) # Recurse entire pattern )* \) # Closing paren /x; my $text = "func(a, b(c, d(e)), f)"; say "Balanced!" 
if $text =~ /^func$balanced$/; ``` ## Real-World Regex Patterns ### Email Validation (Simplified) ```perl # This is simplified. Real email validation is complex! my $email_regex = qr/ ^ # Start [\w\.\-]+ # Local part \@ # At sign [\w\-]+ # Domain name (?:\.[\w\-]+)+ # Domain extensions $ # End /x; my @emails = qw( user@example.com john.doe@company.co.uk invalid@ @invalid.com valid+tag@gmail.com ); for my $email (@emails) { if ($email =~ $email_regex) { say "$email is valid"; } else { say "$email is invalid"; } } ``` ### Log File Parsing ```perl # Apache/Nginx log parser my $log_regex = qr/ ^ (?<ip>[\d\.]+)\s+ # IP address (?<identity>\S+)\s+ # Identity (?<user>\S+)\s+ # User \[(?<timestamp>[^\]]+)\]\s+ # Timestamp "(?<request>[^"]+)"\s+ # Request (?<status>\d{3})\s+ # Status code (?<size>\d+|-)\s* # Response size "(?<referer>[^"]*)"\s* # Referer "(?<user_agent>[^"]*)" # User agent /x; while (my $line = <$log_fh>) { next unless $line =~ $log_regex; my %entry = %+; # Copy all named captures # Process the log entry if ($entry{status} >= 500) { warn "Server error: $entry{request} returned $entry{status}"; } # Extract more info from request if ($entry{request} =~ /^(?<method>\S+)\s+(?<path>\S+)\s+(?<protocol>\S+)/) { $entry{method} = $+{method}; $entry{path} = $+{path}; } } ``` ### Configuration File Parser ```perl # Parse INI-style config files sub parse_config { my ($filename) = @_; my %config; my $section = 'DEFAULT'; open my $fh, '<', $filename or die "Can't open $filename: $!"; while (my $line = <$fh>) { chomp $line; # Skip comments and empty lines next if $line =~ /^\s*(?:#|$)/; # Section header if ($line =~ /^\[([^\]]+)\]/) { $section = $1; next; } # Key-value pair if ($line =~ / ^\s* ([^=]+?) 
# Key (non-greedy) \s*=\s* # Equals with optional whitespace (.*) # Value $ /x) { my ($key, $value) = ($1, $2); # Remove quotes if present $value =~ s/^["']|["']$//g; # Store in config $config{$section}{$key} = $value; } } close $fh; return \%config; } ``` ## Performance and Optimization ### Compile Once, Use Many ```perl # Bad: Regex compiled every iteration for my $line (@lines) { if ($line =~ /$user_pattern/) { # Compiles each time process($line); } } # Good: Pre-compile regex my $regex = qr/$user_pattern/; for my $line (@lines) { if ($line =~ $regex) { # Already compiled process($line); } } # Better: Use state for persistent compiled regex sub match_pattern { my ($text, $pattern) = @_; state %compiled; $compiled{$pattern} //= qr/$pattern/; return $text =~ $compiled{$pattern}; } ``` ### Avoiding Backtracking ```perl # Bad: Catastrophic backtracking possible $text =~ /(\w+)*$/; # Nested quantifiers # Good: Possessive quantifiers prevent backtracking $text =~ /(\w++)*$/; # ++ is possessive # Bad: Greedy matching with backtracking $html =~ /

<div>.*<\/div>/; # Matches too much # Good: Non-greedy matching $html =~ /<div>.*?<\/div>/; # *? is non-greedy # Better: Explicit matching $html =~ /<div>

[^<]*<\/div>/; # More efficient ``` ## Practical Script: Web Scraper Let's build a simple web scraper using Perl's regex powers: ```perl #!/usr/bin/env perl use Modern::Perl '2018'; use LWP::Simple; use HTML::Entities; # Fetch a web page and extract information my $url = shift @ARGV or die "Usage: $0 <url>\n"; my $html = get($url) or die "Couldn't fetch $url\n"; # Remove script and style blocks $html =~ s/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>//gis; $html =~ s/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>//gis; # Extract title my ($title) = $html =~ /<title>([^<]+)<\/title>/i; $title //= 'No title'; say "Title: " . decode_entities($title); # Extract all links my @links = $html =~ /<a\s+(?:[^>]*?\s+)?href=["']([^"']+)["']/gi; say "\nFound " . scalar(@links) . " links:"; # Process and display unique links my %seen; for my $link (@links) { next if $seen{$link}++; next if $link =~ /^#/; # Skip anchors # Make relative URLs absolute if ($link !~ /^https?:\/\//) { if ($link =~ /^\//) { # Absolute path my ($base) = $url =~ /(https?:\/\/[^\/]+)/; $link = "$base$link"; } else { # Relative path my ($base) = $url =~ /(.*\/)/; $link = "$base$link"; } } say " - $link"; } # Extract meta tags say "\nMeta tags:"; while ($html =~ /<meta\s+([^>]+)>/gi) { my $meta = $1; my ($name) = $meta =~ /name=["']([^"']+)["']/i; my ($content) = $meta =~ /content=["']([^"']+)["']/i; if ($name && $content) { say " $name: " . 
decode_entities($content); } } # Extract all email addresses (naive pattern) my @emails = $html =~ /\b([\w\.\-]+@[\w\.\-]+\.\w+)\b/g; if (@emails) { say "\nEmail addresses found:"; my %unique_emails; @unique_emails{@emails} = (); say " - $_" for sort keys %unique_emails; } ``` ## Debugging Regular Expressions ### The use re 'debug' Pragma ```perl use re 'debug'; "test string" =~ /test.*string/; # This will output the regex compilation and execution process # Great for understanding why a regex isn't matching ``` ### Building Regexes Incrementally ```perl # Start simple and build up my $regex = qr/\d+/; # Match numbers $regex = qr/\d+\.\d+/; # Match decimals $regex = qr/\d+(?:\.\d+)?/; # Optional decimal part $regex = qr/^\d+(?:\.\d+)?$/; # Anchor to whole string # Test at each stage my @test_cases = qw(123 123.45 .45 123. abc); for my $test (@test_cases) { if ($test =~ $regex) { say "$test matches"; } else { say "$test doesn't match"; } } ``` ## Common Gotchas and Solutions ### The Greediness Problem ```perl my $xml = '<tag>content</tag><tag>more</tag>'; # Wrong: Greedy matching $xml =~ /<tag>.*<\/tag>/; # Matches entire string! # Right: Non-greedy $xml =~ /<tag>.*?<\/tag>/; # Matches first tag pair # Better: Explicit $xml =~ /<tag>[^<]*<\/tag>/; # Most efficient ``` ### The Anchor Trap ```perl # Dangerous: No anchors if ($input =~ /\d{3}/) { # Matches "abc123def" - probably not intended! } # Safe: With anchors if ($input =~ /^\d{3}$/) { # Only matches exactly 3 digits } ``` ### Special Characters in Variables ```perl my $user_input = "What???"; # Wrong: ? is a regex metacharacter if ($text =~ /$user_input/) { # Error! # Right: Quote metacharacters if ($text =~ /\Q$user_input\E/) { # Treats ??? as literal ``` ## Best Practices 1. **Comment complex regexes** - Use /x modifier liberally 2. **Name your captures** - $+{name} is clearer than $3 3. **Compile once when possible** - Use qr// for repeated patterns 4. 
**Test incrementally** - Build complex patterns step by step 5. **Consider alternatives** - Sometimes a parser is better than a regex 6. **Anchor when appropriate** - Prevent unexpected matches 7. **Be careful with user input** - Always use \Q...\E for literal matching ## The Zen of Perl Regexes Regular expressions in Perl aren't just a feature—they're a philosophy. They embody Perl's core principle: make easy things easy and hard things possible. Yes, you can write unreadable regex golf. But you can also write clear, maintainable patterns that solve real problems elegantly. The key is knowing when to use them. Not every text processing task needs a regex. But when you do need one, Perl ensures you have the full power of regular expressions at your fingertips, integrated seamlessly into the language. --- *Next up: File I/O and directory operations. We'll see how Perl's "Do What I Mean" philosophy extends to file handling, and why Perl remains a favorite for system administrators who need to process thousands of files efficiently.*