Use Logstash with HTML log - logstash

I'm new to Logstash, trying to use it to parse a HTML log file.
I need to output only the log lines, i.e. ignore preceding JS, CSS and HTML that are also included in the file.
A log line in the file looks like this:
<tr bgcolor="tomato"><td>Jan 28<br>13:52:25.692</td><td>Jan 28<br>13:52:23.950</td><td>qtp114615276-1648 [POST] [call_id:-8009072655119858507]</td><td>REST</td><td>sa</td><td>0.0.0.0</td><td>ERR</td><td>ProjectValidator.validate(36)</td><td>Project does not exist</td></tr>
I have no problem getting all the lines, but I would like to have an output which contains only the relevant ones, without HTML tags, and looks something like that:
{
"db_timestamp": "2015-01-28 13:52:25.692",
"server_timestamp": "2015-01-28 13:52:25.950",
"node": "qtp114615276-1648 [POST] [call_id:-8009072655119858507]",
"thread": "REST",
"user": "sa",
"ip": "0.0.0.0",
"level": "ERR",
"method": "ProjectValidator.validate(36)",
"message": "Project does not exist"
}
My Logstash configuration is:
input {
file {
type => "request"
path => "<some path>/*.log"
start_position => "beginning"
}
file {
type => "log"
path => "<some path>/*.html"
start_position => "beginning"
}
}
filter {
if [type] == "log" {
grok {
match => [ WHAT SHOULD I PUT HERE??? ]
}
}
}
output {
stdout {}
if [type] == "request" {
http {
http_method => "post"
url => "http://<some url>"
mapping => ["type", "request", "host" ,"%{host}", "timestamp", "%{#timestamp}", "message", "%{message}"]
}
}
if [type] == "log" {
http {
http_method => "post"
url => "http://<some url>"
mapping => [ ALSO WHAT SHOULD I PUT HERE??? ]
}
}
}
Is there a way to do that? So far I haven't found any relevant documentation or samples.
Thanks!

Finally figured out the answer.
Not sure this is the best or most elegant solution, but it works.
I changed the http output format to "message", which enabled me to override and format the whole message as JSON, instead of using mapping. Also, found out how to name parameters in the grok filter and use them in the output.
This is the new Logstash configuration file:
input {
file {
type => "request"
path => "<some path>/*.log"
start_position => "beginning"
}
file {
type => "log"
path => "<some path>/*.html"
start_position => "beginning"
}
}
filter {
if [type] == "log" {
grok {
match => { "message" => "<tr bgcolor=.*><td>%{MONTH:db_date}%{SPACE}%{MONTHDAY:db_date}<br>%{TIME:db_date}</td><td>%{MONTH:alm_date}%{SPACE}%{MONTHDAY:alm_date}<br>%{TIME:alm_date}</td><td>%{DATA:thread}</td><td>%{DATA:req_type}</td><td>%{DATA:username}</td><td>%{IP:ip}</td><td>%{DATA:level}</td><td>%{DATA:method}</td><td>%{DATA:err_message}</td></tr>" }
}
}
}
output { stdout { codec => rubydebug }
if [type] == "request" {
http {
http_method => "post"
url => "http://<some URL>"
mapping => ["type", "request", "host" ,"%{host}", "timestamp", "%{#timestamp}", "message", "%{message}"]
}
}
if [type] == "log" {
http {
format => "message"
content_type => "application/json"
http_method => "post"
url => "http://<some URL>"
message=> '{
"db_date":"%{db_date}",
"alm_date":"%{alm_date}",
"thread": "%{thread}",
"req_type": "%{req_type}",
"username": "%{username}",
"ip": "%{ip}",
"level": "%{level}",
"method": "%{method}",
"message": "%{err_message}"
}'
}
}
}
Note the single quote for the http message block and the double quotes for the parameters inside this block.

For anyone parsing HP ALM logs, the following Logstash filter will do the work:
grok {
break_on_match => true
match => [ "message", "<tr bgcolor=.*><td>%{MONTH:db_date_mon}%{SPACE}%{MONTHDAY:db_date_day}<br>%{TIME:db_date_time}<\/td><td>%{MONTH:alm_date_mon}%{SPACE}%{MONTHDAY:alm_date_day}<br>%{TIME:alm_date_time}<\/td><td>(?<thread_col1>.*?)<\/td><td>(?<request_type>.*?)<\/td><td>(?<login>.*?)<\/td><td>(?<ip>.*?)<\/td><td>(?<level>.*?)<\/td><td>(?<method>.*?)<\/td><td>(?m:(?<log_message>.*?))</td></tr>" ]
}
mutate {
add_field => ["db_date", "%{db_date_mon} %{db_date_day}"]
add_field => ["alm_date", "%{alm_date_mon} %{alm_date_day}"]
remove_field => [ "db_date_mon", "db_date_day", "alm_date_mon", "alm_date_day" ]
gsub => [
"log_message", "<br>", "
"
]
gsub => [
"log_message", "<p>", " "
]
}
Tested and working fine with Logstash 2.4.0

Related

LogStash Conf | Drop Empty Lines

The contents of LogStash's conf file looks like this:
input {
beats {
port => 5044
}
file {
path => "/usr/share/logstash/iway_logs/*"
start_position => "beginning"
sincedb_path => "/dev/null"
#ignore_older => 0
codec => multiline {
pattern => "^\[%{NOTSPACE:timestamp}\]"
negate => true
what => "previous"
max_lines => 2500
}
}
}
filter {
grok {
match => { "message" =>
['(?m)\[%{NOTSPACE:timestamp}\]%{SPACE}%{WORD:level}%{SPACE}\(%{NOTSPACE:entity}\)%{SPACE}%{GREEDYDATA:rawlog}'
]
}
}
date {
match => [ "timestamp", "yyyy-MM-dd'T'HH:mm:ss.SSS"]
target => "#timestamp"
}
grok {
match => { "entity" => ['(?:W.%{GREEDYDATA:channel}:%{GREEDYDATA:inlet}:%{GREEDYDATA:listener}\.%{GREEDYDATA:workerid}|W.%{GREEDYDATA:channel}\.%{GREEDYDATA:workerid}|%{GREEDYDATA:channel}:%{GREEDYDATA:inlet}:%{GREEDYDATA:listener}\.%{GREEDYDATA:workerid}|%{GREEDYDATA:channel}:%{GREEDYDATA:inlet}:%{GREEDYDATA:listener}|%{GREEDYDATA:channel})']
}
}
dissect {
mapping => {
"[log][file][path]" => "/usr/share/logstash/iway_logs/%{serverName}#%{configName}#%{?ignore}.log"
}
}
}
output {
elasticsearch {
hosts => "${ELASTICSEARCH_HOST_PORT}"
index => "iway_"
user => "${ELASTIC_USERNAME}"
password => "${ELASTIC_PASSWORD}"
ssl => true
ssl_certificate_verification => false
cacert => "/certs/ca.crt"
}
}
As one can make out, the idea is to parse a custom log employing multiline extraction. The extraction does its job. The log occasionally contains an empty first line. So:
[2022-11-29T12:23:15.073] DEBUG (manager) Generic XPath iFL functions use full XPath 1.0 syntax
[2022-11-29T12:23:15.074] DEBUG (manager) XPath 1.0 iFL functions use iWay's full syntax implementation
which naturally is causing Kibana to report an empty line:
In an attempt to supress this line from being sent to ES, I added the following as a last filter item:
if ![message] {
drop { }
}
if [message] =~ /^\s*$/ {
drop { }
}
The resulting JSON payload to ES:
{
"#timestamp": [
"2022-12-09T14:09:35.616Z"
],
"#version": [
"1"
],
"#version.keyword": [
"1"
],
"event.original": [
"\r"
],
"event.original.keyword": [
"\r"
],
"host.name": [
"xxx"
],
"host.name.keyword": [
"xxx"
],
"log.file.path": [
"/usr/share/logstash/iway_logs/localhost#iCLP#iway_2022-11-29T12_23_33.log"
],
"log.file.path.keyword": [
"/usr/share/logstash/iway_logs/localhost#iCLP#iway_2022-11-29T12_23_33.log"
],
"message": [
"\r"
],
"message.keyword": [
"\r"
],
"tags": [
"_grokparsefailure"
],
"tags.keyword": [
"_grokparsefailure"
],
"_id": "oRc494QBirnaojU7W0Uf",
"_index": "iway_",
"_score": null
}
While this does drop the empty first line, it also unfortunately interferes with the multiline operation on other lines. In other words, the multiline operation does not work anymore. What am I doing incorrectly?
Use of the following variation resolved the issue:
if [message] =~ /\A\s*\Z/ {
drop { }
}
This solution is based on Badger's answer provided on the Logstash forums, where this question was raised as well.

Grok help for a custom metric

I have a log line like this:
09 Nov 2018 15:51:35 DEBUG api.MapAnythingProvider - Calling API For Client: XXX Number of ELEMENTS Requested YYY
I want to ignore all other log lines and only want those lines that have the words "Calling API For Client" in it. Further, I am only interested in the String XXX and Number YYY.
Thanks for the help.
input {
file {
path => ["C:/apache-tomcat-9.0.7/logs/service/service.log"]
sincedb_path => "nul"
start_position => "beginning"
}
}
filter {
grok {
match => {
"message" => "%{MONTHDAY:monthDay} %{MONTH:mon} %{YEAR:year} %{TIME:ts} %{WORD:severity} %{JAVACLASS:claz} - %{GREEDYDATA:logmessage}"
}
}
grok {
match => {
"logmessage" => "%{WORD:keyword} %{WORD:customer} %{WORD:key2} %{NUMBER:mapAnythingCreditsConsumed:float} %{WORD:key3} %{NUMBER:elementsFromCache:int}"
}
}
if "_grokparsefailure" in [tags] {
drop {}
}
mutate {
remove_field => [ "monthDay", "mon", "ts", "severity", "claz", "keyword", "key2", "path", "message", "year", "key3" ]
}
}
output {
if [logmessage] =~ /ExecutingJobFor/ {
elasticsearch {
hosts => ["localhost:9200"]
index => "test"
manage_template => false
}
stdout {
codec => rubydebug
}
}
}

Can't grok multiline logs

I have logs where each event is:
ExitNode FF33F91CC06B6CC5C3EE804E7D8DBE42CB5707F9
Published 2017-11-05 02:55:09
LastStatus 2017-11-05 04:02:27
ExitAddress 66.42.224.235 2017-11-05 04:06:26
I tried to use multiline:
input {
file {
path => "/path/input"
}
}
filter {
multiline {
pattern => "^\b[A-Za-z]{8}\b"
what => "next"
}
}
filter {
multiline {
pattern => "^\b[A-Za-z]{8}\b"
what => "next"
}
}
filter {
multiline {
pattern => "^\b[A-Za-z]{11}\b"
what => "previous"
}
}
output {
file {
codec => rubydebug
path => "/path/output"
}
}
And I get something like this:
{
"path" => "/path/input",
"#timestamp" => 2017-11-05T10:25:34.112Z,
"#version" => "1",
"host" => "HOST",
"message" => "ExitNode FE3CB742E73674F1BC2382723209ECEE44AD4AEC\nPublished 2017-11-04 20:34:55\nLastStatus 2017-11-04 21:03:26\nExitAddress 77.250.227.12 2017-11-04 21:06:45",
"tags" => [
[0] "multiline"
]
}
And I can't grok this message field because I don't know how to remove or replace \n and gsub => ["message", "\n", "Line_Break"] doesn't work properly.
Thanks
From the comment of #baudsp:
mutate {
gsub =>
["message", "[\r\n]","_"]
}

Logstash issue with json_formater [LogStash::Json::ParserError: Unexpected character ('-' (code 45)): was expecting comma to separate ARRAY entries]

I have an issue with converting value through logstash, I can't find solution for it. it seems to be linked to the date.
#Log line
[2017-08-15 12:30:17] api.INFO: {"sessionId":"a216925---ff5992be7520924ff25992be75209c7","action":"processed","time":1502789417,"type":"bookingProcess","page":"order"} [] []
Logstash configuration
filter {
if [type] == "api-prod-log" {
grok {
match => {"message" => "\[%{TIMESTAMP_ISO8601:timestamp}\] %{WORD:module}.%{WORD:level}: (?<log_message>.*) \[\] \[\]" }
add_field => [ "received_from", "%{host}" ]
}
json {
source => "log_message"
target => "flightSearchRequest"
remove_field=>["log_message"]
}
date {
match => [ "timestamp", "YYYY-MM-dd HH:mm:ss" ]
timezone => "Asia/Jerusalem"
}
}
}
Any idea ?
Thanks
What version of Logstash are you using?
On Logstash 5.2.2 with the following Logstash config:
input {
stdin{}
}
filter {
grok {
match => {"message" => '\[%{TIMESTAMP_ISO8601:timestamp}\] %{WORD:module}.%{WORD:level}: (?<log_message>.*) \[\] \[\]' }
}
json {
source => "log_message"
target => "flightSearchRequest"
remove_field=>["log_message"]
}
date {
match => [ "timestamp", "YYYY-MM-dd HH:mm:ss" ]
timezone => "Asia/Jerusalem"
}
}
output{
stdout {codec => "rubydebug"}
}
I get a perfectly correct result and no errors, when I pass your log line as input:
{
"#timestamp" => 2017-08-15T09:30:17.000Z,
"flightSearchRequest" => {
"action" => "processed",
"sessionId" => "a216925---ff5992be7520924ff25992be75209c7",
"time" => 1502789417,
"page" => "order",
"type" => "bookingProcess"
},
"level" => "INFO",
"module" => "api",
"#version" => "1",
"message" => "[2017-08-15 12:30:17] api.INFO: {\"sessionId\":\"a216925---ff5992be7520924ff25992be75209c7\",\"action\":\"processed\",\"time\":1502789417,\"type\":\"bookingProcess\",\"page\":\"order\"} [] []",
"timestamp" => "2017-08-15 12:30:17"
}
I've removed the check for "type" in the beginning, can you test if that can affect the result?

File input add_field not adding field to every row

I am parsing several logfiles of different load balanced serverclusters with my logstash config and would like to add a field "log_origin" to each file's entries for the later easy filtering.
Here's my input->file config in a simple example:
input {
file {
type => "node1"
path => "C:/Development/node1/log/*"
add_field => [ "log_origin", "live_logs" ]
}
file {
type => "node2"
path => "C:/Development/node2/log/*"
add_field => [ "log_origin", "live_logs" ]
}
file {
type => "node3"
path => "C:/Development/node1/log/*"
add_field => [ "log_origin", "live_logs" ]
}
file {
type => "node4"
path => "C:/Development/node1/log/*"
add_field => [ "log_origin", "live_logs" ]
}
}
filter {
grok {
match => [
"message","%{DATESTAMP:log_timestamp}%{SPACE}\[%{DATA:class}\]%{SPACE}%{LOGLEVEL:loglevel}%{SPACE}%{GREEDYDATA:log_message}"
]
}
date {
match => [ "log_timestamp", "dd.MM.YY HH:mm:ss", "ISO8601" ]
target => "#timestamp"
}
mutate {
lowercase => ["loglevel"]
strip => ["loglevel"]
}
if "_grokparsefailure" in [tags] {
multiline {
pattern => ".*"
what => "previous"
}
}
if[fields.log_origin] == "live_logs"{
if [type] == "node1" {
mutate {
add_tag => "realsServerName1"
}
}
if [type] == "node2" {
mutate {
add_tag => "realsServerName2"
}
}
if [type] == "node3" {
mutate {
add_tag => "realsServerName3"
}
}
if [type] == "node4" {
mutate {
add_tag => "realsServerName4"
}
}
}
}
output {
stdout { }
elasticsearch { embedded => true }
}
I would have expected logstash to add this field with the value given to every logentry it finds, but it doesn't. Maybe I am completely taking the wrong approach here?
Edit: I am not able to retrieve the logs directly from the nodes, but have to copy them over to my "server". Otherwise i would be able to just use the filepath for distinguishing different clusters...
Edit: It's working. I should have cleand my data in between. Old entries without the field added cluttered up my results.
The add_field expects a hash. It should be
add_field => {
"log_origin" => "live_logs"
}

Resources