I found this code to split a CSV file using python.
I need to split a 3,000,000-record CSV file into separate files whenever the value in column A changes.
I also need to add 2 more fields to the table
Blank (add a comma next to each line).
Add a date in the last field, but it should ask me for the date.
Would someone be able to help me add 2 things to this code?
A prompt to add more fields
A prompt what should be in the field
I am copying the code from the link included earlier
#!/usr/bin/env python3
import binascii
import csv
import os.path
import sys
from tkinter.filedialog import askopenfilename, askdirectory
from tkinter.simpledialog import askinteger
def split_csv_file(f, dst_dir, keyfunc):
    """Split the CSV stream *f* into one output file per key.

    Every row read from *f* is written to the file ``dst_dir/<key>``, where
    ``<key>`` is the value returned by ``keyfunc(row)``. An output file is
    created (truncating any existing file) the first time its key is seen.

    Args:
        f: Open text file, or any iterable of lines, containing CSV data.
        dst_dir: Directory in which the output files are created.
        keyfunc: Callable taking a row (list of strings) and returning the
            output file name for that row.
    """
    from contextlib import ExitStack  # local import: only needed here

    csv_writers = {}
    # The stack owns every output file and closes (and so flushes) all of
    # them when the loop finishes or an exception escapes.
    with ExitStack() as stack:
        for row in csv.reader(f):
            k = keyfunc(row)
            if k not in csv_writers:
                out_file = stack.enter_context(
                    open(os.path.join(dst_dir, k), mode='w', newline=''))
                csv_writers[k] = csv.writer(out_file)
            csv_writers[k].writerow(row)
def get_args_from_cli():
    """Return ``(input_filename, column, dst_dir)`` read from ``sys.argv``.

    The values come from argument positions 1, 2 and 3; the column
    argument is converted with ``int``.
    """
    argv = sys.argv
    return (argv[1], int(argv[2]), argv[3])
def get_args_from_gui():
    """Ask for the three arguments through Tk dialogs.

    Prompts, in order, for the CSV input file, the table column number and
    the destination directory, and returns the answers as
    ``(input_filename, column, dst_dir)``.
    """
    csv_only = (('CSV', '.csv'),)
    chosen_file = askopenfilename(filetypes=csv_only,
                                  title='Select CSV Input File')
    chosen_column = askinteger('Choose Table Column', 'Table column')
    chosen_dir = askdirectory(title='Select Destination Directory')
    return (chosen_file, chosen_column, chosen_dir)
if __name__ == '__main__':
    # Script entry point: no arguments means "ask through dialogs", three
    # arguments means "read them from the command line"; anything else is
    # rejected before any work is done.
    argc = len(sys.argv)
    if argc not in (1, 4):
        raise Exception("Invalid number of arguments")
    get_args = get_args_from_gui if argc == 1 else get_args_from_cli
    input_filename, column, dst_dir = get_args()
    with open(input_filename, mode='r', newline='') as f:
        # The output file name is the value of the chosen (1-based) column.
        split_csv_file(f, dst_dir, lambda r: r[column-1]+'.csv')
    # if the column has funky values resulting in invalid filenames
    # replace the line from above with:
    # split_csv_file(f, dst_dir, lambda r: binascii.b2a_hex(r[column-1].encode('utf-8')).decode('utf-8')+'.csv')
Thank you
had it written in VBS
' Splits the CSV file dropped onto this script into csv_split_<n>.csv files
' written next to the input. A new output file is started each time the
' first column changes. Every data row gets a blank field plus a value
' entered by the user appended; the header row gets "Off Peak" and
' "Effective Date" appended and is repeated at the top of every output file.
basename = "csv_split_"
hasHeader = True 'Change to False if there is no header.
argCnt = WScript.Arguments.Count
If argCnt < 1 Then
    WScript.Echo "Drag a CSV over this script to edit it."
    WScript.Quit
End If
flnm = WScript.Arguments.Item(0)
Set fs = WScript.CreateObject("Scripting.FileSystemObject")
' Validate first: opening a missing file would raise a run-time error before
' the friendly message below could be shown.
If Not fs.FileExists(flnm) Or LCase(fs.GetExtensionName(flnm)) <> "csv" Then
    WScript.Echo "This script is meant for CSV only."
    WScript.Quit
End If
Set cv = fs.OpenTextFile(flnm)
fol = fs.GetParentFolderName(flnm)
customValue = InputBox("What should the last column contain?", "Column Info")
' Matches the first field (quoted or unquoted) together with its comma.
Set pat = New RegExp
pat.Global = True
pat.IgnoreCase = True
pat.Pattern = "^(""[^""]*""|[^,]*)?,"
recentCol = ""
csvCount = 1
header = ""
cnt = 0
While Not cv.AtEndOfStream
    cnt = cnt + 1
    row = cv.ReadLine
    ' comma holds the separator needed so that row & comma always ends in ","
    If Right(row, 1) <> "," Then
        comma = ","
    Else
        comma = ""
    End If
    Set col1 = pat.Execute(row)
    If col1.Count > 0 Then
        col = col1.Item(0).Value
    Else
        ' A row without any comma has no match; use the whole row as its key.
        col = row
    End If
    sameFile = True
    If cnt = 1 Then
        ' First line: remember it as the header and open the first output file.
        header = row & comma & """Off Peak"",""Effective Date"""
        Set csv = fs.OpenTextFile(fol & "\" & basename & csvCount & ".csv", 8, True)
    ElseIf col <> recentCol And cnt > 2 Then
        ' Key changed (line 2 is the first data row, so it never triggers a
        ' split): switch to the next output file.
        csvCount = csvCount + 1
        sameFile = False
        csv.Close
        Set csv = fs.OpenTextFile(fol & "\" & basename & csvCount & ".csv", 8, True)
    End If
    recentCol = col
    If hasHeader And (cnt = 1 Or Not sameFile) Then
        csv.WriteLine header
    End If
    ' Write the row itself unless it is the header line. Previously the row
    ' that started a new file was replaced by the header and lost.
    If Not (hasHeader And cnt = 1) Then
        csv.WriteLine row & comma & ",""" & customValue & """"
    End If
Wend
' csv is only set once at least one line has been read.
If cnt > 0 Then csv.Close
cv.Close
Works Great!
Related
I have 2 worksheets that I'm trying to compare.
The problem is that I can't go row by row, because the second worksheet has extra entries based on "Batch Size" (please see the example below). The second worksheet can also contain duplicated or missing data.
example picture
I believe it would be a lot easier to find any discrepancies if I have the "Bolt ID"s already created on the first worksheet then just go down 1-by-1 on every row and find the corresponding row that includes the same "Bolt ID" somewhere on the second worksheet.
Based on Batch Size, if Batch Size = 0
Bolt ID = Program ID_Step Number
if Batch Size is bigger than 0 then (for example Batch Size = 4)
`Bolt ID = Program ID_Step Number_1
`Bolt ID = Program ID_Step Number_2
`Bolt ID = Program ID_Step Number_3
`Bolt ID = Program ID_Step Number_4`
Any help is much appreciated in advance
Thank you
#freeflow
Thank you, I've ended up adding "_0" to all "Bolt ID" where they were needed using the following formula:
=IF(LEN(A1)-LEN(SUBSTITUTE(A1,"_",""))>1,A1,IF(RIGHT(A1,LEN(A1)-FIND("#",SUBSTITUTE(A1,"_","#",LEN(A1)-LEN(SUBSTITUTE(A1,"_",""))),2))<>"0",A1&"_0",A1))
Then I had the consistent "Bolt ID" to work with as you recommended and I could use this loop to compare my 2 sheets:
' Walks every non-hidden row of the "steps" sheet, builds the Bolt IDs expected
' for that row and looks each one up in column P of the active sheet, colouring
' the cells that are found.
lastrow = steps.Range("A" & steps.Rows.Count).End(xlUp).Row
' Start at row 2 (presumably row 1 is a header row; confirm against the sheet).
counter = 2
Do While counter <= lastrow
If Not steps.Range("A" & counter).EntireRow.Hidden Then
sequence_id = steps.Range("A" & counter)
step_no = steps.Range("B" & counter)
descr = steps.Range("C" & counter)
type_of_op = steps.Range("D" & counter)
tool_id = steps.Range("E" & counter)
batch_size = steps.Range("H" & counter)
' Rows whose column C is "Scan Process Barcode" are skipped.
If steps.Range("C" & counter) <> "Scan Process Barcode" Then
' "Fastening" rows start numbering at 1, every other type starts at 0.
If type_of_op = "Fastening" Then
fastening_id = 1
Else: fastening_id = 0
End If
' One Bolt ID per fastening_id value up to and including batch_size.
Do While fastening_id <= batch_size
bolt_id = sequence_id & "_" & step_no & "_" & fastening_id
' r holds the row of the previous hit; 0 means no hit yet.
r = 0
With ActiveSheet.Range("P:P")
' Whole-cell (xlWhole), case-sensitive (last argument True) search in values.
Set loc = .Cells.Find(bolt_id, , xlValues, xlWhole, , , True)
If loc Is Nothing Then
MsgBox bolt_id & " Not found"
Else
occured = 0
' Stop as soon as FindNext returns a hit at or above the previous hit's row.
Do Until loc.Row <= r
colorrange = (loc.Address)
occured = occured + 1
' NOTE(review): occured was just incremented, so this test is always true
' and every hit gets ColorIndex 45.
If occured >= 1 Then
ActiveSheet.Range(colorrange).Interior.ColorIndex = 45
End If
r = loc.Row
Set loc = .FindNext(loc)
Loop
' Exactly one hit: re-colour that single cell with ColorIndex 4.
If occured = 1 Then
ActiveSheet.Range(colorrange).Interior.ColorIndex = 4
End If
End If
End With
fastening_id = fastening_id + 1
Loop
End If
End If
counter = counter + 1
Loop
All it does is highlight the duplicated values for me, but by offsetting from loc.Address I can compare the rest of the cells and let the code make decisions.
I am looking for some input and possible example for parsing a text file with the following format: (sorry not sure how to retain the formatting of the file in this text)
NAME ID FORMAT SHORT NAME
DESCRIPTION (this field is on the second row and indented by 5 spaces)
The first row (NAME, ID, FORMAT and SHORT NAME) always consist of just one row. The DESCRIPTION text may span multiple rows. In some cases, there is only a first row of NAME, ID, etc. without a corresponding DESCRIPTION row.
Here is an example of how the data looks in the file now:
NAME ID FORMAT SHORT NAME
DESCRIPTION
ABC 01 xx AB
abcdefg
hijklm
nopqrs
DEF 02 xx DE
abcedfg
hijklmnopqrst
GHI 03 xx.x GH
JKL 001 xx JKL
abcdef
ghijk
lmnopq
rstu
vwxyz
I would like to parse out the NAME, ID, FORMAT, SHORT NAME and DESCRIPTION into 5 separate columns in a csv or excel file for additional analysis. I don't care if the DESCRIPTION field is broken across multiple lines but it can also be concatenated into a single longer string.
Hope this all makes sense. Thanks in advance!
Providing the data for NAME,ID,FORMAT and SHORT NAME is aligned
beneath their header word then use those words on the first line
to calculate the start position and length of each field, then split
the lines into fields using Mid(). Join the description lines and write out to
the previous record before a new record is started. For example
Option Explicit
' Parses the fixed-width text file INFILE into a new workbook saved as OUTFILE.
' The first line of the file is the header; the positions of the words NAME,
' ID, FORMAT and SHORT NAME on it define the field columns. Every later line
' that starts with a space is description text for the current record; any
' other non-blank line starts a new record. Blank lines are ignored.
Sub ParseTextFile()
    Const INFILE = "c:\temp\testfile.txt"
    Const OUTFILE = "c:\temp\testfile.xlsx"
    Dim wbOut As Workbook, ws As Worksheet, iRow As Long
    Dim txt As String, ff As Integer, i As Integer, desc As String
    Dim start(4) As Integer, length(4) As Integer
    ' Long rather than Integer: an Integer line counter overflows after
    ' 32,767 lines.
    Dim count As Long, msg As String

    Set wbOut = Workbooks.Add
    Set ws = wbOut.Sheets("Sheet1")
    ws.Range("A1:E1") = Array("NAME", "ID", "FORMAT", "SHORT NAME", "DESCRIPTION")
    ws.Columns("A:E").NumberFormat = "#"
    iRow = 1

    ff = FreeFile()
    Open INFILE For Input As #ff
    While Not EOF(ff)
        count = count + 1
        Line Input #ff, txt
        If count = 1 Then
            ' Header line: field i starts where its header word starts and
            ' runs up to the start of the next header word.
            start(1) = InStr(1, txt, "NAME", vbTextCompare)
            start(2) = InStr(1, txt, "ID", vbTextCompare)
            start(3) = InStr(1, txt, "FORMAT", vbTextCompare)
            start(4) = InStr(1, txt, "SHORT NAME", vbTextCompare)
            For i = 1 To 3
                length(i) = start(i + 1) - start(i)
            Next
        ElseIf Len(txt) = 0 Then
            ' Blank line: nothing to parse. Treating it as a record would
            ' pass a negative length to Mid() below.
        ElseIf Left(txt, 1) = " " Then
            ' Indented line: description text, joined with single spaces.
            desc = desc & Trim(txt) & " "
        Else
            ' save the description from last record
            ws.Cells(iRow, 5) = Trim(desc)
            desc = ""
            ' new row
            iRow = iRow + 1
            ' The last field runs to the end of the line. Clamp at zero so a
            ' line that ends before the SHORT NAME column gives an empty
            ' field instead of a run-time error in Mid().
            length(4) = Len(txt) - start(4) + 1
            If length(4) < 0 Then length(4) = 0
            For i = 1 To 4
                ws.Cells(iRow, i) = Mid(txt, start(i), length(i))
            Next
        End If
    Wend
    Close #ff

    ' final description
    ws.Cells(iRow, 5) = Trim(desc)

    ' save result
    ws.Columns("A:E").AutoFit
    wbOut.Close True, OUTFILE
    msg = count & " lines read from " & INFILE & vbCr & _
          iRow - 1 & " rows written to " & OUTFILE
    MsgBox msg, vbInformation
End Sub
Here is my code:
import openpyxl
# Scans A1:A100 of sheet "Blad1" in Bok1.xlsx for a cell containing "Konto"
# and prints fields from cells offset from it whose account value is 306888
# or 306889.
# NOTE(review): the indentation of this snippet was lost in the paste, so the
# exact nesting of the statements below cannot be confirmed from here.
wb = openpyxl.load_workbook("Bok1.xlsx", data_only=True)
ws = wb["Blad1"]
# n is the row offset from the "Konto" cell; it is never reset in this snippet.
n = 0
for row in ws['A1:A100']:
for cell in row:
if cell.value == "Konto":
# Looks at up to 13 cells, one further row down per step (n is incremented).
for hej in range(13):
n+=1
konto = cell.offset(row=n).value
# Empty cells and text cells are ignored.
if konto == None or isinstance(konto, str) == True:
pass
else:
if konto == 306888 or konto == 306889:
#derp = input("derpderpderp?: ")
#if derp == "y":
# Same condition as the enclosing test; it stands in for the commented-out
# prompt above.
if konto == 306888 or konto == 306889:
# The remaining fields are read from the same row offset, at column offsets
# 1, 2, 3, 4, 5, 8 and 9.
kst = cell.offset(row=n, column = 1).value
proj = cell.offset(row=n, column = 2).value
vht = cell.offset(row=n, column = 3).value
motp = cell.offset(row=n, column = 4).value
fin = cell.offset(row=n, column = 5).value
text = cell.offset(row=n, column = 8).value
belopp = cell.offset(row=n, column = 9).value
print(konto)
print(kst)
print(proj)
print(vht)
print(motp)
As you can see from the commented-out (#) lines in the code, I have an input that sits inside a loop. How could I write this code so that it does not ask for the input on every iteration?
What I would do is to keep a counter:
# Set-up for the "ask only once" approach: cnt starts at 1, before the loop.
n = 0
cnt=1 # counter
for row in ws['A1:A100']:
# continue with your code
Then, when you get to your input :
# cnt is 1 only until the first prompt; it is incremented right after, so
# input() is not called again on later passes.
if cnt==1:
derp = input("derpderpderp?: ")
cnt += 1
# derp keeps the answer from the first pass for every later iteration.
if derp == "y":
#continue with your code
This way, you ensure that your input is taken only on the first go.
I have excel files converted to txt. In some files, some columns are skipped. That is controlled by database:
file | remove_column
=======+===============
file1 | CASE NOTE
-------+---------------
file2 | Description
-------+---------------
file3 | Item | Address
Remove_Column has the header (1st row). If several columns should be skipped, they are delimited with '|'
I have to compare each converted txt file with its original Excel file to check that they match. How can I read all columns except those listed in the DB table?
I am using UFT 12.5. Reading Excel through Excel.Application or ADO.
Thnx)
UPD: Code I use:
I have columns hard-coded:
' Picks the hard-coded list of column numbers to read for the given file name;
' the list is kept as an array of numeric strings.
Select Case OrigFileName 'file names come from database
Case "Fees mm-yy.xls"
ColumnNames = Split("1,2,3,4,5,6,7,8,9,10,11,12,13", ",")
Case "Exp mm-yy.xls"
ColumnNames = Split("1,2,3,4,5,6,7,8,9,12,13,14,15,16,19,20", ",")
End Select
But there are 50 files, and the business might ask to remove or to add back any columns; also, new files are coming...(((
' Compares the original Excel sheet with the converted text file: the selected
' cells of a sheet row are joined with tabs and checked against a line read
' from the text file.
' NOTE(review): as pasted, only one Next closes the two For loops and vstring
' is never reset, so whether the comparison is meant to run per cell or per
' row cannot be confirmed from this snippet.
Dim fsox : Set fsox = CreateObject("Scripting.FileSystemObject")
Dim TargFileRead : Set TargFileRead = fsox.OpenTextFile(targetFile)
Dim OrgExcel : Set OrgExcel = CreateObject("Excel.Application")
OrgExcel.Workbooks.Open(originalfile)
Set vSheet = OrgExcel.WorkSheets(TabUse) 'excel sheet name, comes from database
print vSheet.UsedRange.Rows.Count
For rc = 1 To vSheet.UsedRange.Rows.Count
For coc = 0 To UBound(ColumnNames) 'column names hard-coded
' ColumnNames holds column numbers as strings; convert before indexing.
cc = cInt(ColumnNames(coc))
vtext = vSheet.cells(rc,cc)
' Cells that contain only a tab character are left out of the joined string.
If NOT(vtext=ChrW(9)) Then
If vstring="" Then
vstring=vtext
Else
vstring = vstring&vbTab&vtext
End If
End If
If len(vstring)>0 Then
TargFileText = TargFileRead.ReadLine
' Strip leading tab characters from the text line.
Do
If Left(TargFileText, 1)=ChrW(9) Then
TargFileText = MID(TargFileText, 2)
Else
Exit Do
End If
Loop
' Strip trailing tab characters from the text line.
Do
If RIGHT(TargFileText, 1)=ChrW(9) Then
TargFileText= mid(TargFileText,1,len(TargFileText)-1)
Else
Exit Do
End If
Loop
TargFileStr = Trim(TargFileText)
' Only mismatches are reported; both sides are trimmed before comparing.
If trim(vstring) = trim(TargFileStr) Then
' print "match"
Else
print "-=Not Match=-"&VBNewLine&"txt:::"&trim(TargFileStr)&VBNewLine&"xls:::"&trim(vstring)
End If
End If
Next
I would suggest replacing the Select Case statement with a function call that returns the relevant columns for the sheet as an array. The logic that decides which columns are allowed then goes into a second function. That makes the logic more flexible than a fixed list of columns.
' Returns the numbers of the columns on sheet OrigFileName whose row-1 header
' is accepted by isColumnAllowed, as an array of strings in ascending column
' order (built as a comma-separated list and split at the end).
Function getColumns(OrigFileName as String) As String()
    Dim columnTotal As Integer
    Dim picked As String

    columnTotal = Sheets(OrigFileName).UsedRange.Columns.Count
    For col = 1 To columnTotal
        If isColumnAllowed(OrigFileName, Sheets(OrigFileName).Cells(1, col)) Then
            ' A separator goes before every entry except the first.
            If Len(picked) > 0 Then picked = picked & ","
            picked = picked & col
        End If
    Next
    getColumns = Split(picked, ",")
End Function
' Returns True when the header columnName may be read for the file
' OrigFileName, i.e. when it is not in that file's list of forbidden headers.
' Headers are compared as whole strings (case-sensitive, surrounding spaces
' ignored). The previous Filter()-based test matched substrings, so a header
' such as "NOTE" or a blank header was rejected as well. Files without an
' entry below have no forbidden headers.
Function isColumnAllowed(ByVal OrigFileName As String, columnName As String) As Boolean
    Dim Forbidden As Variant
    Dim entry As Variant

    Select Case OrigFileName
        Case "file1"
            Forbidden = Split("CASE NOTE", "/")
        Case "file2"
            Forbidden = Split("Description", "/")
        Case "file3"
            Forbidden = Split("Item/ Address", "/")
        Case Else
            ' Unknown file: an empty list instead of an unassigned Variant.
            Forbidden = Array()
    End Select

    isColumnAllowed = True
    For Each entry In Forbidden
        ' Trim both sides: list entries such as " Address" carry padding.
        If StrComp(Trim(entry), Trim(columnName), vbBinaryCompare) = 0 Then
            isColumnAllowed = False
            Exit Function
        End If
    Next
End Function
This is what I have now and is working:
' Builds ColumnNumbers, a comma-separated list of the sheet's column numbers,
' leaving out every column whose row-1 header appears in ColumnToRemove
' (header names delimited with "|"). With an empty ColumnToRemove all columns
' are listed.
If Len(ColumnToRemove) > 0 Then
    ColumnToRemoveCol = Split(ColumnToRemove, "|") 'set collection of header strings to skip column
    For L = 1 To vSheet.UsedRange.Columns.Count
        ' Reset once per column, before checking the remove list. Resetting
        ' inside the inner loop meant only the last list entry could exclude
        ' a column.
        AddCol = 0
        For x = 0 To UBound(ColumnToRemoveCol)
            ' Trim both sides so that entries written as "Item | Address"
            ' still match their headers.
            If Trim(vSheet.Cells(1, L).Value) = Trim(ColumnToRemoveCol(x)) Then
                AddCol = AddCol + 1
                Exit For
            End If
        Next
        If AddCol = 0 Then ColumnNumbers = ColumnNumbers & "," & L
    Next
Else
    For L = 1 To vSheet.UsedRange.Columns.Count
        ColumnNumbers = ColumnNumbers & "," & L
    Next
End If
' Drop the separator left at either end of the list.
If Left(ColumnNumbers, 1) = "," Then ColumnNumbers = Mid(ColumnNumbers, 2)
If Right(ColumnNumbers, 1) = "," Then ColumnNumbers = Mid(ColumnNumbers, 1, Len(ColumnNumbers) - 1)
Printing the columns for first excel file in my case gives the next line:
ColumnNumbers: 1,2,3,4,5,6,7,8,10,11,12,15,16,17
Further usage:
' Usage sketch: turn the comma-separated ColumnNumbers list into an array and
' read only those columns from every used row of the sheet.
getColumns = Split(ColumnNumbers, ",")
For rc = 1 To vSheet.UsedRange.Rows.Count
For coc = 0 To UBound(getColumns)
' Entries are numeric strings; convert before using them as a column index.
cc = cInt(getColumns(coc))
vtext = vSheet.cells(rc,cc)
.....
Next
Next
I have two identically structured sheets (always 63 columns; currently 504 rows and growing), and I want to pick out the rows that match in several columns. I am using two nested For loops: for each row of the first sheet I compare every row of the second sheet against it, then move on to the next row of the first sheet, and so on until the last row; an If statement then checks whether the pair meets my conditions. The problem is that this takes too much time (about 8 minutes). I tried the lookup functions, but they failed because they can only match on one value. I have already set ScreenUpdating, Calculation and EnableEvents to False, and even reduced the status bar to something very basic to improve performance, but none of that gave me the result I wanted.
How can i improve performance in any way possible , a new function or anything ??
PS: sometimes some of the conditions are not important; that depends on the True or False values in some of the cells.
' For every row of the source-region sheet, read its key fields and compare
' them with the key fields of the rows of the region sheet.
' NOTE(review): the snippet is cut off after the If line, so the action taken
' on a match is not visible here.
For Row_S = 2 To MAX_Row_S
' The SOP cell is read twice: once for its month and once for its year.
SourceMonth = Worksheets(NBG_SourceRegionDataWorksheetName).Cells(Row_S, SOP).Value
SourceMonth = DatePart("m", SourceMonth)
SourceYear = Worksheets(NBG_SourceRegionDataWorksheetName).Cells(Row_S, SOP).Value
SourceYear = DatePart("yyyy", SourceYear)
SourceCarmaker = Worksheets(NBG_SourceRegionDataWorksheetName).Cells(Row_S, carmaker).Value
SourceProject = Worksheets(NBG_SourceRegionDataWorksheetName).Cells(Row_S, Project).Value
SourceFamily = Worksheets(NBG_SourceRegionDataWorksheetName).Cells(Row_S, Family).Value
SourceStatus = Worksheets(NBG_SourceRegionDataWorksheetName).Cells(Row_S, Status).Value
SourceShare = Worksheets(NBG_SourceRegionDataWorksheetName).Cells(Row_S, Share).Value
SourceCst = Worksheets(NBG_SourceRegionDataWorksheetName).Cells(Row_S, "A").Value
SourcePID = Worksheets(NBG_SourceRegionDataWorksheetName).Cells(Row_S, ProjectID).Value
' Take the data from NBG_Data_Region sheet to be Compared with each row of the NBG_Data_Source_Region sheet
For Row_T = 2 To MAX_Row_T
' NOTE(review): this leaves the loop when Row_T reaches MAX_Row_T, so row
' MAX_Row_T itself is never compared. Confirm that this is intended.
If Row_T >= MAX_Row_T Then
Exit For
End If
' The same cell reads are repeated for every Row_S / Row_T pair.
NBGMonth = Worksheets(NBG_RegionaDataWorksheetName).Cells(Row_T, SOP).Value
NBGMonth = DatePart("m", NBGMonth)
NBGYear = Worksheets(NBG_RegionaDataWorksheetName).Cells(Row_T, SOP).Value
NBGYear = DatePart("yyyy", NBGYear)
NBGCarmaker = Worksheets(NBG_RegionaDataWorksheetName).Cells(Row_T, carmaker).Value
NBGProject = Worksheets(NBG_RegionaDataWorksheetName).Cells(Row_T, Project).Value
NBGFamily = Worksheets(NBG_RegionaDataWorksheetName).Cells(Row_T, Family).Value
NBGStatus = Worksheets(NBG_RegionaDataWorksheetName).Cells(Row_T, Status).Value
NBGShare = Worksheets(NBG_RegionaDataWorksheetName).Cells(Row_T, Share).Value
NBGCst = Worksheets(NBG_RegionaDataWorksheetName).Cells(Row_T, "A").Value
NBGPID = Worksheets(NBG_RegionaDataWorksheetName).Cells(Row_T, ProjectID).Value
' StatusBar Show
Application.StatusBar = "Running"
'Application.StatusBar = "VerifyMultipleCustomerProjects. Progress: " & Row_S & " of " & MAX_Row_S
' Check if any project in the NBG_Data_Region have multiple customers and add it ti the sheet Issue_MultipleCustomerProjects
' NAF 20161208
'Test with Source of YEAR and MONTH
' Each criterion passes when the two values agree or when its switch cell on
' the issue sheet (C21, C25, G25, F25, E25, H25) is True; in addition the
' column-A values of the two rows must differ.
If ((NBGMonth = SourceMonth Or Worksheets(Issue_MultipleCustomerProjectsWorksheetName).Range("C21") = True) And _
(NBGYear = SourceYear Or Worksheets(Issue_MultipleCustomerProjectsWorksheetName).Range("C25") = True) And _
(SourceCarmaker = NBGCarmaker Or Worksheets(Issue_MultipleCustomerProjectsWorksheetName).Range("G25") = True) And _
(SourceProject = NBGProject Or Worksheets(Issue_MultipleCustomerProjectsWorksheetName).Range("F25") = True) And _
(SourceFamily = NBGFamily Or Worksheets(Issue_MultipleCustomerProjectsWorksheetName).Range("E25") = True) And _
(SourceShare + NBGShare <> 1 Or Worksheets(Issue_MultipleCustomerProjectsWorksheetName).Range("H25") = True) And NBGCst <> SourceCst) Then
Have you tried adding
' Switch off screen redraws, event handling and alert prompts while the macro runs.
Application.ScreenUpdating = False
Application.EnableEvents = False
Application.DisplayAlerts = False
at the beginning of your code, and
' Switch screen redraws, event handling and alert prompts back on.
Application.ScreenUpdating = True
Application.EnableEvents = True
Application.DisplayAlerts = True
at the end of your code?
This will turn off screen updating, events, and alerts causing faster run-time.
Also, loading the data into arrays and working on the arrays is the fastest approach, if you decide to take that route.
An example of loading an array:
' Copies the value of every cell of a range into the dynamic array arr
' (1-based), growing the array by one slot per cell.
' "Range" below is the placeholder from the original example; substitute the
' actual range to read.
Dim arr() As Variant ' let brackets empty, not Dim arr(1) As Variant !
Dim n As Long ' number of values stored so far
For Each a In Range.Cells
    ' change / adjust the size of array
    ' The size is tracked in n because UBound(arr) raises "Subscript out of
    ' range" while arr is still unallocated, i.e. on the first pass.
    n = n + 1
    ReDim Preserve arr(1 To n)
    ' add value on the end of the array
    arr(n) = a.Value
Next
An example of iterating through the array to pull your data:
' Visit every stored value in turn; do_something stands for the per-value work.
For Each element In arr 'Each array element
do_something (element)
Next element