I have an input file with billions of records and a header.
The header holds meta info: the total number of rows and the sum of the sixth column. I am splitting the file into smaller files, so each split file's header record must be updated, because its row count and sixth-column sum change.
This is the sample input:
filename: testFile.text
00|STMT|08-09-2022 13:24:56||5|13.10|SHA2
10|000047290|8ddcf4b2356dfa7f326ca8004a9bdb6096330fc4f3b842a971deaf660a395f65|18-01-2020|12:36:57|3.10|00004729018-01-20201|APP
10|000052736|cce280392023b23df2a00ace4b82db8eb61c112bb14509fb273c523550059317|07-02-2017|16:27:49|2.00|00005273607-02-20171|APP
10|000070355|f2e86d2731d32f9ce960a0f5883e9b688c7e57ab9c2ead86057f98426407d87a|17-07-2019|20:25:02|1.00|00007035517-07-20192|APP
10|000070355|54c1fc2667e160a11ae1dbf54d3ba993475cd33d6ececdd555fb5c07e64a241b|17-07-2019|20:25:02|5.00|00007035517-07-20192|APP
10|000072420|f5dac143082631a1693e0fb5429d3a185abcf3c47b091be2f30cd50b5cf4be11|14-06-2021|20:52:21|2.00|00007242014-06-20212|APP
Expected:
filename: testFile_1.text
00|STMT|08-09-2022 13:24:56||3|6.10|SHA2
10|000047290|8ddcf4b2356dfa7f326ca8004a9bdb6096330fc4f3b842a971deaf660a395f65|18-01-2020|12:36:57|3.10|00004729018-01-20201|APP
10|000052736|cce280392023b23df2a00ace4b82db8eb61c112bb14509fb273c523550059317|07-02-2017|16:27:49|2.00|00005273607-02-20171|APP
10|000070355|f2e86d2731d32f9ce960a0f5883e9b688c7e57ab9c2ead86057f98426407d87a|17-07-2019|20:25:02|1.00|00007035517-07-20192|APP
filename: testFile_2.text
00|STMT|08-09-2022 13:24:56||2|7.00|SHA2
10|000070355|54c1fc2667e160a11ae1dbf54d3ba993475cd33d6ececdd555fb5c07e64a241b|17-07-2019|20:25:02|5.00|00007035517-07-20192|APP
10|000072420|f5dac143082631a1693e0fb5429d3a185abcf3c47b091be2f30cd50b5cf4be11|14-06-2021|20:52:21|2.00|00007242014-06-20212|APP
I am able to split the file and calculate the sum, but I am unable to replace the value in the header part.
This is the script I have made:
#!/bin/bash
splitRowCount=$1
transactionColumn=$2
filename=$(basename -- "$3")
extension="${filename##*.}"
nameWithoutExt="${filename%.*}"
echo "splitRowCount: $splitRowCount"
echo "transactionColumn: $transactionColumn"
awk 'NR == 1 { head = $0 } NR % '$splitRowCount' == 2 { filename = "'$nameWithoutExt'_" int((NR-1)/'$splitRowCount')+1 ".'$extension'"; print head > filename } NR != 1 { print >> filename }' $filename
ls *.txt | while read line
do
firstLine=$(head -n 1 $line);
awk -F '|' 'NR !=1 {sum += '$transactionColumn'}END {print sum} ' $line
done
Here's an awk solution for splitting the original file into files of n records each. The idea is to accumulate the records until the given count is reached, then generate a file with the updated header followed by the accumulated records:
n=3
file=./testFile.text
awk -v numRecords="$n" '
BEGIN {
FS = OFS = "|"
if ( match(ARGV[1],/[^\/]\.[^\/]*$/) ) {
filePrefix = substr(ARGV[1],1,RSTART)
fileSuffix = substr(ARGV[1],RSTART+1)
} else {
filePrefix = ARGV[1]
fileSuffix = ""
}
if (getline headerStr <= 0)
exit 1
split(headerStr, headerArr)
}
(NR-2) % numRecords == 0 && recordsCount {
outfile = filePrefix "_" ++filesCount fileSuffix
print headerArr[1],headerArr[2],headerArr[3],headerArr[4],recordsCount,recordsSum,headerArr[7] > outfile
printf("%s", records) > outfile
close(outfile)
records = ""
recordsCount = recordsSum = 0
}
{
records = records $0 ORS
recordsCount++
recordsSum += $6
}
END {
if (recordsCount) {
outfile = filePrefix "_" ++filesCount fileSuffix
print headerArr[1],headerArr[2],headerArr[3],headerArr[4],recordsCount,recordsSum,headerArr[7] > outfile
printf("%s", records) > outfile
close(outfile)
}
}
' "$file"
With the given sample you'll get:
testFile_1.text
00|STMT|08-09-2022 13:24:56||3|6.1|SHA2
10|000047290|8ddcf4b2356dfa7f326ca8004a9bdb6096330fc4f3b842a971deaf660a395f65|18-01-2020|12:36:57|3.10|00004729018-01-20201|APP
10|000052736|cce280392023b23df2a00ace4b82db8eb61c112bb14509fb273c523550059317|07-02-2017|16:27:49|2.00|00005273607-02-20171|APP
10|000070355|f2e86d2731d32f9ce960a0f5883e9b688c7e57ab9c2ead86057f98426407d87a|17-07-2019|20:25:02|1.00|00007035517-07-20192|APP
testFile_2.text
00|STMT|08-09-2022 13:24:56||2|7|SHA2
10|000070355|54c1fc2667e160a11ae1dbf54d3ba993475cd33d6ececdd555fb5c07e64a241b|17-07-2019|20:25:02|5.00|00007035517-07-20192|APP
10|000072420|f5dac143082631a1693e0fb5429d3a185abcf3c47b091be2f30cd50b5cf4be11|14-06-2021|20:52:21|2.00|00007242014-06-20212|APP
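Note that awk prints the sums in its default numeric format, hence 6.1 and 7 rather than 6.10 and 7.00 in the headers. If you need two decimals as in the expected output, format the sum in both print statements, for example:
print headerArr[1],headerArr[2],headerArr[3],headerArr[4],recordsCount,sprintf("%.2f",recordsSum),headerArr[7] > outfile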
With your shown samples, please try the following awk code (written and tested with GNU awk). I have defined awk variables: fileInitials, which holds the output files' base name (e.g. testFile); extension, which holds the output files' extension (e.g. .txt here); and lines, which sets how many records you want in each output file. The header's record count and sum are both recomputed for every output file.
You need not run shell + awk; this can be done in a single awk program as shown below.
awk -v count="1" -v fileInitials="testFile" -v extension=".txt" -v lines="3" '
BEGIN { FS=OFS="|" }
FNR==1{
match($0,/^([^|]*\|[^|]*\|[^|]*\|[^|]*)\|[^|]*\|[^|]*(.*)/,arr)
header1=arr[1]
header2=arr[2]
outputFile=(fileInitials count extension)
next
}
{
if(prev!=count){
if(val!=""){
print (header1,cnt,sum header2 ORS val) > (outputFile)
close(outputFile)
}
outputFile=(fileInitials count extension)
sum=0
cnt=0
val=""
}
sum+=$6
cnt++
val=(val?val ORS:"") $0
prev=count
count=(++countline%lines==0?++count:count)
}
END{
if(val!=""){
print (header1,cnt,sum header2 ORS val) > (outputFile)
close(outputFile)
}
}
' Input_file
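To produce the question's actual file names (testFile_1.text, testFile_2.text, ...), just set the variables accordingly, e.g. (where '...' stands for the same program as above):
awk -v count="1" -v fileInitials="testFile_" -v extension=".text" -v lines="3" '...' testFile.text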
My data:
"1,2,3,4,5,64,3,9",,,,,1,aine
"2,3,4,5",,,,,3,bb
"3,4,5,6,6,2",,,,,2,ff
I have to transpose the values inside the "..." delimiter two by two (as in: how to transpose values two by two using shell?),
and output the result (two columns) to a new file whose name is the digit in the next-to-last column. I have to do this transposition for each line of my input file.
What I would like:
$ ls
1 2 3 4 5 6 7 8
example: cat 1
1 2
3 4
5 64
3 9
cat 2:
3 4
5 6
6 2
cat 3:
2 3
4 5
Bonus: if I could get each last word (the last column) as the name of the new file, it would be perfect.
OK, it took some time, but I finally solved your problem with the code below:
#!/bin/bash
while read -r LINE; do
    # part after the ,,,,, separator: its first field is the file name
    FILE_NAME=$(echo "${LINE##*,,,,,}" | cut -d ',' -f 1 | tr -d "\"")
    # part before the separator: drop the quotes, turn commas into spaces
    DATA=$(echo "${LINE%%,,,,,*}" | tr -d "\"" | tr "," " ")
    i=1
    for num in $DATA; do
        echo -n "$num"
        # newline after every second value, a space otherwise
        if [[ $((i % 2)) == 0 ]]; then
            echo ""
        else
            echo -n " "
        fi
        i=$((i + 1))
    done > "$FILE_NAME"
done < input.txt
In my solution I assume that your input is placed in a file input.txt and that all of your input lines have ,,,,, as the separator. It works like a charm with your sample input.
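For the bonus (naming each file after the last word instead), the same parameter expansion can feed cut a different field; a sketch, assuming everything after ,,,,, always has the form digit,word:
FILE_NAME=$(echo "${LINE##*,,,,,}" | cut -d ',' -f 2)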
Assuming there are no colons in the input (choose a different temporary delimiter if necessary), the first part can be done with:
awk '{s = ""; n = split($2,k,","); for(i = 1; i <= n; i+=2 ) { s = sprintf( "%s%c%s:%s", s, s ? ":" : "", k[i+1], k[i])} $2 = s}1' FS=\" OFS=\" input | sort -t , -k6n | tr : ,
eg:
$ cat input
"1,2,3,4,5,64,3,9",,,,,1,aine
"2,3,4,5",,,,,3,bb
"3,4,5,6,6,2",,,,,2,ff
$ awk '{s = ""; n = split($2,k,","); for(i = 1; i <= n; i+=2 ) { s = sprintf( "%s%c%s:%s", s, s ? ":" : "", k[i+1], k[i])} $2 = s}1' FS=\" OFS=\" input | sort -t , -k6n | tr : ,
"2,1,4,3,64,5,9,3",,,,,1,aine
"4,3,6,5,2,6",,,,,2,ff
"3,2,5,4",,,,,3,bb
But it's not clear why you want to do the first part at all when you can just skip straight to part 2 with:
awk '{n = split($2,k,","); m = split($3, j, ","); fname = j[6];
for( i = 1; i <= n; i+=2 ) printf("%d %d\n", k[i+1], k[i]) > fname}' FS=\" input
My answer can't keep up with the changes to the question! If you are outputting the lines into files, then there is no need to sort on the penultimate column. If you want the filenames to be the final column, it's not clear why you ever mentioned using the penultimate column at all. Just change fname in the above to j[7] to get the final column.
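For reference, the bonus version in full would then be:
awk '{n = split($2,k,","); m = split($3, j, ","); fname = j[7];
for( i = 1; i <= n; i+=2 ) printf("%d %d\n", k[i+1], k[i]) > fname}' FS=\" input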
Sample input data:
Col1, Col2
120000,1261
120000,119879
120000,117737
120000,14051
200000,58411
200000,115292
300000,279892
120000,98572
250000,249598
120000,14051
......
I used Excel with the following steps:
Col3=Col2/Col1.
Format Col3 with percentage
Use countif to group by Col3
How can I do this task with awk, or some other way, on the Linux command line?
Expected result:
percent|count
0-20% | 10
21-50% | 5
51-100%| 10
I calculated the percentage, but I'm still looking for a way to group by Col3:
cat input.txt |awk -F"," '$3=100*$2/$1'
awk approach:
awk 'BEGIN {
FS=",";
OFS="|";
}
(NR > 1){
percent = 100 * $2 / $1;
if (percent <= 20) {
a["0-20%"] += 1;
} else if (percent <= 50) {
a["21-50%"] += 1;
} else {
a["51-100%"] += 1;
}
}
END {
print "percent", "count"
for (i in a) {
print i, a[i];
}
}' data
Sample output:
percent|count
0-20%|3
21-50%|1
51-100%|6
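One caveat: for (i in a) visits the indices in an unspecified order, so the three groups may not always come out in this sequence. With GNU awk you can force ascending string order of the labels (which happen to sort correctly here) by adding one line at the top of the END block:
PROCINFO["sorted_in"] = "@ind_str_asc"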
A generic, self-documented version. The group labels in the result may need some fine-tuning (whether or not to add 1% to the lower edge), but that is not the real purpose here:
awk -F ',' -v Step='0|20|50|100' '
BEGIN {
# define group
Gn = split( Step, aEdge, "|")
}
NR>1{
# compute the percentage
L = $2 * 100 / ($1>0 ? $1 : 1)
# in which group
for( j=1; ( L < aEdge[j] || L >= aEdge[j+1] ) && j < Gn;) j++
# add to group
G[j]++
}
# print result ordered
END {
print "percent|count"
for( i=1;i<Gn;i++) printf( "%d-%d%%|%d\n", aEdge[i], aEdge[i+1], G[i])
}
' data
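With the sample data this should print (note the labels follow the raw edges, hence 20-50% rather than 21-50%):
percent|count
0-20%|3
20-50%|1
50-100%|6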
Another awk, with parametric bins and formatted output:
$ awk -F, -v OFS=\| -v bins='20,50,100' '
BEGIN {n=split(bins,b)}
NR>1 {for(i=1;i<=n;i++)
if($2/$1 <= b[i]/100)
{a[b[i]]++; next}}
END {print "percent","count";
b[0]=-1;
for(i=1;i<=n;i++)
printf "%-7s|%3s\n", b[i-1]+1"-"b[i]"%",a[b[i]]}' file
percent|count
0-20% | 3
21-50% | 1
51-100%| 6
Pure bash:
# arguments are histogram boundaries *in ascending order*
hist () {
local lower=0$(printf '+(val*100>sum*%d)' "$@") val sum count n;
set -- 0 "$@" 100;
read -r
printf '%7s|%5s\n' percent count;
while IFS=, read -r sum val; do echo $((lower)); done |
sort -n | uniq -c |
while read count n; do
printf '%2d-%3d%%|%5d\n' "${@:n+1:2}" $count;
done
}
Example:
$ hist 20 50 < csv.dat
percent|count
0- 20%| 3
20- 50%| 1
50-100%| 6
Potential Issue: Does not print intervals with no values:
$ hist 20 25 45 50 < csv.dat
percent|count
0- 20%| 3
25- 45%| 1
50-100%| 6
Explanation:
lower is set to an expression which counts how many of the given boundaries lie below the row's percentage 100*val/sum
The list of intervals is augmented with 0 and 100 so that the limits print correctly
The header line is ignored
The output header is printed
For each csv row, read the variables $sum and $val and send the numeric evaluation of $lower (which uses those variables) to...
count the number of instances of each interval count...
and print the interval and count
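For example, hist 20 50 sets lower to the string 0+(val*100>sum*20)+(val*100>sum*50); for the row 200000,58411 (about 29%), $((lower)) evaluates to 1, and after set -- 0 20 50 100 the expansion "${@:2:2}" yields the bounds 20 and 50.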
Another, in GNU awk, using switch and regex to identify the values (since parsing was tagged in OP):
NR>1{
switch(p=$2/$1){
case /0\.[01][0-9]|\.20/:
a["0-20%"]++;
break;
case /\.[2-4][0-9]|\.50/:
a["21-50%"]++;
break;
default:
a["51-100%"]++
}
}
END{ for(i in a)print i, a[i] }
Run it:
$ awk -F, -f program.awk file
21-50% 1
0-20% 3
51-100% 6
I'm struggling to reformat a comma-separated file using awk. The file contains minute data for a day, for multiple servers and multiple metrics,
e.g. 2 records, per minute, per server, for 24 hrs.
Example input file:
server01,00:01:00,AckDelayAverage,9999
server01,00:01:00,AckDelayMax,8888
server01,00:02:00,AckDelayAverage,666
server01,00:02:00,AckDelayMax,5555
.....
server01,23:58:00,AckDelayAverage,4545
server01,23:58:00,AckDelayMax,8777
server01,23:59:00,AckDelayAverage,4686
server01,23:59:00,AckDelayMax,7820
server02,00:01:00,AckDelayAverage,1231
server02,00:01:00,AckDelayMax,4185
server02,00:02:00,AckDelayAverage,1843
server02,00:02:00,AckDelayMax,9982
.....
server02,23:58:00,AckDelayAverage,1022
server02,23:58:00,AckDelayMax,1772
server02,23:59:00,AckDelayAverage,1813
server02,23:59:00,AckDelayMax,9891
I'm trying to re-format the file to have a single row for each minute with a unique concatenation of fields 1 & 3 as the column headers
e.g the expected output file would look like:
Minute, server01-AckDelayAverage,server01-AckDelayMax, server02-AckDelayAverage,server02-AckDelayMax
00:01:00,9999,8888,1231,4185
00:02:00,666,5555,1843,9982
...
...
23:58:00,4545,8777,1022,1772
23:59:00,4686,7820,1813,9891
A solution using GNU awk. Call this as awk -F, -f script input_file:
/Average/ { average[$2, $1] = $4; }
/Max/ { maximum[$2, $1] = $4; }
{
if (!($2 in minutes)) {
minutes[$2] = 1;
}
if (!($1 in servers)) {
servers[$1] = 1;
}
}
END {
mcount = asorti(minutes, smin);
scount = asorti(servers, sserv);
printf "minutes";
for (col = 1; col <= scount; col++) {
printf "," sserv[col] "-average," sserv[col] "-maximum";
}
print "";
for (row = 1; row <= mcount; row++) {
key = smin[row];
printf key;
for (col = 1; col <= scount; col++) {
printf "," average[key, sserv[col]] "," maximum[key, sserv[col]];
}
print "";
}
}
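With the sample input, the header row should come out as:
minutes,server01-average,server01-maximum,server02-average,server02-maximum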
Run it as an executable awk script (GNU awk is required for the multidimensional arrays): ./script.awk file
#! /bin/awk -f
BEGIN{
FS=",";
OFS=","
}
$1 ~ /server01/ && $3 ~ /Average/{
a[$2]["Avg01"] = $4;
}
$1 ~ /server01/ && $3 ~ /Max/{
a[$2]["Max01"] = $4;
}
$1 ~ /server02/ && $3 ~ /Average/{
a[$2]["Avg02"] = $4;
}
$1 ~ /server02/ && $3 ~ /Max/{
a[$2]["Max02"] = $4;
}
END{
print "Minute","server01-AckDelayAverage","server01-AckDelayMax","server02-AckDelayAverage","server02-AckDelayMax"
for(i in a){
print i,a[i]["Avg01"],a[i]["Max01"],a[i]["Avg02"],a[i]["Max02"] | "sort"
}
}
With awk and sort:
awk -F, -v OFS=, '{
a[$2]=(a[$2]?a[$2]","$4:$4)
}
END{
for ( i in a ) print i,a[i]
}' File | sort
If $4 can contain 0 values (which would make the ternary test above misfire):
awk -F, -v OFS=, '!a[$2]{a[$2]=$2} {a[$2]=a[$2]","$4} END{for ( i in a ) print a[i]}' File | sort
!a[$2]{a[$2]=$2}: if array a has no entry with index $2 (the minute), create one whose value is $2 itself. This fires the first time a given minute occurs.
{a[$2]=a[$2]","$4}: append $4 to that array entry.
END: print all values in array a.
Finally, pipe the awk output to sort.
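With the sample input, this second variant should print the data rows of the expected output (the header line would still need to be added separately), e.g.:
00:01:00,9999,8888,1231,4185
00:02:00,666,5555,1843,9982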
I have a file BLACK.FUL.eg2:
10>BLACK.FUL>272/GSMA/000000>151006>01
15>004401074905590>004401074905590>B>I>0011>Insert>240/PLMN/000100>>5000-K525122-15
15>004402145955010>004402145955010>B>I>0011>Insert>240/PLMN/000100>>1200-K108534-14
15>004402146016260>004402146016360>B>I>0011>Insert>240/PLMN/000100>>1200-K-94878-14
15>004402452698630>004402452698630>B>I>0011>Insert>240/PLMN/000100>>5000-K538947-14
90>BLACK.FUL>272/GSMA/000000>151006>01>4
I've written this AWK script:
awk 'NR > 2 { print p } { p = $0 }' BLACK.FUL.eg2 | awk -F">" \
'{if (length($2) == 15) print substr($2,1,length($2)-1)","substr($3,1,length($3)-1)","$6","$8; \
else print $2","$3","$6","$8;}' | awk -F"," '{if ($2 == $1) print $1","$3","$4; \
else {if (length($1) > 14) {v = substr($1,9,6); t = substr($2,9,6); \
while(v <= t) print substr($2,1,8)v++substr($2,15,2)","$3","$4;} \
else {d = $1;while(d <= $2) print d++","$3","$4;}}}'
which gives me an output of:
00440107490559,0011,240/PLMN/000100
00440214595501,0011,240/PLMN/000100
440214601626,0011,240/PLMN/000100
440214601627,0011,240/PLMN/000100
440214601628,0011,240/PLMN/000100
440214601629,0011,240/PLMN/000100
440214601630,0011,240/PLMN/000100
440214601631,0011,240/PLMN/000100
440214601632,0011,240/PLMN/000100
440214601633,0011,240/PLMN/000100
440214601634,0011,240/PLMN/000100
440214601635,0011,240/PLMN/000100
440214601636,0011,240/PLMN/000100
00440245269863,0011,240/PLMN/000100
with one problem: the leading 0s of the strings in field 1 are automatically getting removed due to a numeric operation on them. So my actual expected output is:
00440107490559,0011,240/PLMN/000100
00440214595501,0011,240/PLMN/000100
00440214601626,0011,240/PLMN/000100
00440214601627,0011,240/PLMN/000100
00440214601628,0011,240/PLMN/000100
00440214601629,0011,240/PLMN/000100
00440214601630,0011,240/PLMN/000100
00440214601631,0011,240/PLMN/000100
00440214601632,0011,240/PLMN/000100
00440214601633,0011,240/PLMN/000100
00440214601634,0011,240/PLMN/000100
00440214601635,0011,240/PLMN/000100
00440214601636,0011,240/PLMN/000100
00440245269863,0011,240/PLMN/000100
For that I'm trying the below updated AWK script:
awk 'NR > 2 { print p } { p = $0 }' BLACK.FUL.eg2 | awk -F">" \
'{if (length($2) == 15) print substr($2,1,length($2)-1)","substr($3,1,length($3)-1)","$6","$8; \
else print $2","$3","$6","$8;}' | awk -F"," '{if ($2 == $1) print $1","$3","$4; \
else {if (length($1) > 14) {v = substr($1,9,6); t = substr($2,9,6); \
while(v <= t) print substr($2,1,8)v++substr($2,15,2)","$3","$4;} \
else {d = $1; for ( i=1;i<length($1);i++ ) if (substr($1,i++,1) == "0") \
{m=m"0"; else exit 1;}; while(d <= $2) print md++","$3","$4;}}}'
But getting an error:
awk: cmd. line:4: {m=m"0"; else exit 1;}; while(d <= $2) print md++","$3","$4;}}}
awk: cmd. line:4: ^ syntax error
Can you please highlight what I'm doing wrong to achieve the expected output? A modification to my already existing AWK script would be of much help. Thanks.
NOTE: The leading 0s can occur any number of times, not only two 0s in every case as in the example outputs above.
Since your field sizes are fixed, for the given example just change the last print statement to:
$ awk ... printf "%014d,%s,%s\n",d++,$3,$4}}}'
00440107490559,0011,240/PLMN/000100
00440214595501,0011,240/PLMN/000100
00440214601626,0011,240/PLMN/000100
00440214601627,0011,240/PLMN/000100
00440214601628,0011,240/PLMN/000100
00440214601629,0011,240/PLMN/000100
00440214601630,0011,240/PLMN/000100
00440214601631,0011,240/PLMN/000100
00440214601632,0011,240/PLMN/000100
00440214601633,0011,240/PLMN/000100
00440214601634,0011,240/PLMN/000100
00440214601635,0011,240/PLMN/000100
00440214601636,0011,240/PLMN/000100
00440245269863,0011,240/PLMN/000100
UPDATE
If your field size is not fixed, you can capture the length (or desired length) and use the same pattern. Since your code is rather complicated, I'm going to write a proof of concept which you can embed into your script.
This is essentially your problem: increment a zero-padded number and the leading zeros are dropped.
$ echo 0001 | awk '{$1++; print $1}'
2
This is the proposed solution, with parametric length and zero padding:
$ echo 0001 | awk '{n=length($1); $1++; printf "%0"n"s\n", $1}'
0002
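GNU awk also supports a dynamic field width, so the length can be passed as a separate printf argument instead of being spliced into the format string:
$ echo 0001 | awk '{n=length($1); $1++; printf "%0*d\n", n, $1}'
0002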