I have code that looks like this; it is supposed to collect some custom metrics and expose them over Prometheus.
def collect_metrics():
    registry = prometheus_client.CollectorRegistry()
    label_names = ['parent', 'namespace', 'team', 'name', 'status']
    sib = Gauge('disk_sizeInBytes', 'Gets the size of the disk in bytes.', label_names, registry=registry)
    msib = Gauge('disk_maxSizeInMegabytes', 'Gets or sets the maximum size of the disk in megabytes, which is the size of memory allocated for the disk.', label_names, registry=registry)
    ...
    sib.labels(parent=parent_name, namespace=namespace_name, team=team, name=disk_name, status=disk_status).set(disk_list[dp]["sizeInBytes"])
    msib.labels(parent=parent_name, namespace=namespace_name, team=team, name=disk_name, status=disk_status).set(disk_list[dp]["maxSizeInMegabytes"])
    print(f'{datetime.datetime.now()} | disk_name: {disk_name} | sib: {disk_list[dp]["sizeInBytes"]} | msib: {disk_list[dp]["maxSizeInMegabytes"]}')

...

if __name__ == '__main__':
    ...
    start_http_server(8005)
    collect_metrics()
The code runs without any errors; however, I don't see my metrics on the endpoint http://localhost:8005/, although I do see some default metrics being shown, such as:
# HELP python_gc_objects_collected_total Objects collected during gc
# TYPE python_gc_objects_collected_total counter
python_gc_objects_collected_total{generation="0"} 403.0
python_gc_objects_collected_total{generation="1"} 0.0
python_gc_objects_collected_total{generation="2"} 0.0
# HELP python_gc_objects_uncollectable_total Uncollectable object found during GC
# TYPE python_gc_objects_uncollectable_total counter
python_gc_objects_uncollectable_total{generation="0"} 0.0
python_gc_objects_uncollectable_total{generation="1"} 0.0
python_gc_objects_uncollectable_total{generation="2"} 0.0
# HELP python_gc_collections_total Number of times this generation was collected
# TYPE python_gc_collections_total counter
python_gc_collections_total{generation="0"} 39.0
python_gc_collections_total{generation="1"} 3.0
python_gc_collections_total{generation="2"} 0.0
# HELP python_info Python platform information
# TYPE python_info gauge
python_info{implementation="CPython",major="3",minor="10",patchlevel="4",version="3.10.4"} 1.0
Can someone point out what the issue is here?
Couple of things:
Remove registry = prometheus_client.CollectorRegistry()
Remove registry=registry from the Gauge declarations
Add a loop to keep the process running.
import datetime
import re
import time

from prometheus_client import CollectorRegistry, Gauge
from prometheus_client import start_http_server


def collect_metrics():
    label_names = ['parent', 'namespace', 'team', 'name', 'status']
    sib = Gauge(
        'disk_sizeInBytes',
        'Gets the size of the disk in bytes.',
        label_names,
    )
    msib = Gauge(
        'disk_maxSizeInMegabytes',
        'Gets or sets the maximum size of the disk in megabytes, which is the size of memory allocated for the disk.',
        label_names,
    )
    sib.labels(
        parent="parent_name",
        namespace="namespace_name",
        team="team",
        name="disk_name",
        status="disk_status",
    ).set(10.0)
    msib.labels(
        parent="parent_name",
        namespace="namespace_name",
        team="team",
        name="disk_name",
        status="disk_status",
    ).set(5.0)


if __name__ == '__main__':
    ...
    start_http_server(8005)
    collect_metrics()
    while True:
        time.sleep(5)
# HELP disk_sizeInBytes Gets the size of the disk in bytes.
# TYPE disk_sizeInBytes gauge
disk_sizeInBytes{name="disk_name",namespace="namespace_name",parent="parent_name",status="disk_status",team="team"} 10.0
# HELP disk_maxSizeInMegabytes Gets or sets the maximum size of the disk in megabytes, which is the size of memory allocated for the disk.
# TYPE disk_maxSizeInMegabytes gauge
disk_maxSizeInMegabytes{name="disk_name",namespace="namespace_name",parent="parent_name",status="disk_status",team="team"} 5.0
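If you specifically want to keep your own CollectorRegistry (for example, to hide the default process/GC metrics), an alternative is to pass that registry to start_http_server instead of removing it. A minimal sketch, not tested against your exact code:

import time
from prometheus_client import CollectorRegistry, Gauge, start_http_server

registry = CollectorRegistry()
label_names = ['parent', 'namespace', 'team', 'name', 'status']
sib = Gauge('disk_sizeInBytes', 'Gets the size of the disk in bytes.', label_names, registry=registry)

if __name__ == '__main__':
    # serve only the metrics registered in this custom registry
    start_http_server(8005, registry=registry)
    sib.labels(parent="parent_name", namespace="namespace_name", team="team", name="disk_name", status="disk_status").set(10.0)
    while True:
        time.sleep(5)

With this variant the endpoint shows only disk_sizeInBytes, without the python_gc_* and python_info defaults.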
Related
I'm working with multiple well-formed XML files whose sizes range from 100 MB to 4 GB. My goal is to read them as strings and then import them as ElementTree objects using the .fromstring() method (from the xml.etree.ElementTree module).
However, as the process goes on and the string size increases, two exceptions related to memory limits occurred:
xml.etree.ElementTree.ParseError: out of memory: line 1, column 0
OverflowError: size does not fit in an int
It looks like the .fromstring() method enforces a size limit on the input string, around 1 GB?
To debug this, I wrote a short script using a for loop:
from xml.etree import ElementTree as cElementTree  # xml.etree.cElementTree itself was removed in 3.9

xmlFiles_list = [path1, path2, ...]

for fp in xmlFiles_list:
    xml_fo = open(fp, mode='r', encoding="utf-8")
    xml_asStr = xml_fo.read()
    xml_fo.close()
    print(len(xml_asStr.encode("utf-8")) / 10**9)  # display string size in GB
    try:
        etree = cElementTree.fromstring(xml_asStr)
        print(".fromstring() success!\n")
    except Exception as e:
        print(f"Error :{type(e)} {str(e)}\n")
        continue
The output is as follows:
0.895206753
.fromstring() success!
1.220224531
Error :<class 'xml.etree.ElementTree.ParseError'> out of memory: line 1, column 0
1.328233473
Error :<class 'xml.etree.ElementTree.ParseError'> out of memory: line 1, column 0
2.567867904
Error :<class 'OverflowError'> size does not fit in an int
4.080672538
Error :<class 'OverflowError'> size does not fit in an int
I found multiple workarounds to avoid this issue: the .parse() method, or the lxml module for better performance (a sketch of the .parse() variant is below, after the questions). I just hope someone could shed some light on this:
Is there a specific string size limit in the xml.etree.ElementTree module and its .fromstring() method?
Why do I end up with two different exceptions as the string size increases? Are they related to the same memory-allocation restriction?
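For reference, the .parse()-based workaround looks roughly like this; it is only a sketch, reusing the same xmlFiles_list as in the debug script above:

import xml.etree.ElementTree as ET

for fp in xmlFiles_list:  # same list of paths as above
    # .parse() feeds the parser straight from the file object, so the document
    # never has to exist as one giant Python string
    tree = ET.parse(fp)
    root = tree.getroot()
    print(fp, "->", root.tag)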
Python version/system: 3.9 (64-bit)
RAM: 32 GB
I hope my question is clear enough; I'm new to Stack Overflow.
I am trying to do clustering with CLARA using RStudio on Linux, and I have a very large dataset.
However, it seems that there is not enough memory for the whole dataset.
## Estimating the number of clusters ----
fviz_nbclust(df, clara, method = "silhouette", k.max = 15)
It showed me this:
Error: cannot allocate vector of size 339.8 GB
So I tried all of the following and it still didn't work. memory.limit() is also specific to Windows (I still gave it a try, though).
# devtools::install_github("krlmlr/ulimit")
# gc()
# memory.limit(9999999999)
#
#
# install.packages("devtools", dependencies = TRUE)
# devtools::install_github("krlmlr/ulimit")
# ulimit::memory_limit(2000)
#
# devtools::install_github("jeroen/unix")
#
#
# if(.Platform$OS.type == "windows") withAutoprint({
# memory.size()
# memory.size(TRUE)
# memory.limit()
# })
# memory.limit(size=56000)
# memory.size(max = FALSE)
Can somebody help me with this?
Any help would be appreciated!
The error simply means that R cannot allocate a 339.8 GB vector in your RAM. Do you have 360 GB of RAM?
If not, you will have to take a sample with something like dplyr::sample_n() and run the function on a subset of your dataset.
I have spent considerable time trying to debug some PyTorch code, and I have created a minimal example of it to help better understand what the issue might be.
I have removed all portions of the code that are unrelated to the issue, so the remaining piece of code won't make much sense from a functional standpoint, but it still displays the error I'm facing.
The overall task I'm working on is in a loop: every pass of the loop computes the embedding of the image and adds it to a variable storing it. It is effectively aggregated (not concatenated), so the size remains the same. I don't expect the number of iterations to force the datatype to overflow, and I don't see this happening here or in my code.
I have added multiple metrics to evaluate the size of the tensors I'm working with, to make sure they're not growing in memory footprint.
I'm also checking the overall GPU memory usage to track down the issue leading to the final RuntimeError: CUDA out of memory.
My environment is as follows:
- python 3.6.2
- Pytorch 1.4.0
- Cudatoolkit 10.0
- Driver version 410.78
- GPU: Nvidia GeForce GT 1030 (2GB VRAM)
(though I've replicated this experiment with the same result on a Titan RTX with 24 GB, with the same PyTorch version, CUDA toolkit, and driver; it only goes out of memory further into the loop).
Complete code below. I have marked 2 lines as culprits, as deleting them removes the issue, though obviously I need to find a way to execute them without having memory issues. Any help would be much appreciated! You may try with any image named "source_image.bmp" to replicate the issue.
import torch
from PIL import Image
import torchvision
from torchvision import transforms
from pynvml import nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo, nvmlInit
import sys
import os

os.environ["CUDA_VISIBLE_DEVICES"] = '0'  # this is necessary on my system to allow the environment to recognize my nvidia GPU for some reason
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'  # to debug by having all CUDA functions executed in place
torch.set_default_tensor_type('torch.cuda.FloatTensor')

# Preprocess image
tfms = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])
img = tfms(Image.open('source_image.bmp')).unsqueeze(0).cuda()

model = torchvision.models.resnet50(pretrained=True).cuda()
model.eval()  # we put the model in evaluation mode, to prevent storage of gradient which might accumulate

nvmlInit()
h = nvmlDeviceGetHandleByIndex(0)
info = nvmlDeviceGetMemoryInfo(h)
print(f'Total available memory : {info.total / 1000000000}')

feature_extractor = torch.nn.Sequential(*list(model.children())[:-1])
orig_embedding = feature_extractor(img)

embedding_depth = 2048
mem0 = 0
embedding = torch.zeros(2048, img.shape[2], img.shape[3])  # , dtype=torch.float)

patch_size = [4, 4]
patch_stride = [2, 2]
patch_value = 0.0

# Here, we iterate over the patch placement, defined at the top left location
for row in range(img.shape[2] - 1):
    for col in range(img.shape[3] - 1):
        print("######################################################")

        ######################################################
        # Isolated line, culprit 1 of the GPU memory leak
        ######################################################
        patched_embedding = feature_extractor(img)
        delta_embedding = (patched_embedding - orig_embedding).view(-1, 1, 1)

        ######################################################
        # Isolated line, culprit 2 of the GPU memory leak
        ######################################################
        embedding[:, row:row + 1, col:col + 1] = torch.add(embedding[:, row:row + 1, col:col + 1], delta_embedding)

        print("img size:\t\t", img.element_size() * img.nelement())
        print("patched_embedding size:\t", patched_embedding.element_size() * patched_embedding.nelement())
        print("delta_embedding size:\t", delta_embedding.element_size() * delta_embedding.nelement())
        print("Embedding size:\t\t", embedding.element_size() * embedding.nelement())

        del patched_embedding, delta_embedding
        torch.cuda.empty_cache()

        info = nvmlDeviceGetMemoryInfo(h)
        print("\nMem usage increase:\t", info.used / 1000000000 - mem0)
        mem0 = info.used / 1000000000
        print(f'Free:\t\t\t {(info.total - info.used) / 1000000000}')

print("Done.")
Add this to your code as soon as you load the model:

for param in model.parameters():
    param.requires_grad = False
from https://pytorch.org/docs/stable/notes/autograd.html#excluding-subgraphs-from-backward
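A complementary option, sketched below by reusing the feature_extractor, img, and embedding variables from your code, is to run the forward passes under torch.no_grad(), so autograd never records a graph for them and the intermediate activations can be freed right away:

import torch

with torch.no_grad():  # no autograd graph is recorded inside this block
    orig_embedding = feature_extractor(img)
    for row in range(img.shape[2] - 1):
        for col in range(img.shape[3] - 1):
            patched_embedding = feature_extractor(img)
            delta_embedding = (patched_embedding - orig_embedding).view(-1, 1, 1)
            embedding[:, row:row + 1, col:col + 1] += delta_embedding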
Hi, we have MySQL master-slave replication; the master is MySQL 5.6 and the slave is MySQL 5.7. Seconds_Behind_Master is 245000. How do I make the slave catch up faster? Right now it is taking more than 6 hours to catch up 100,000 seconds.
My slave has 128 GB of RAM. Below is my my.cnf:
[mysqld]
# Remove leading # and set to the amount of RAM for the most important data
# cache in MySQL. Start at 70% of total RAM for dedicated server, else 10%.
innodb_buffer_pool_size = 110G
# Remove leading # to turn on a very important data integrity option: logging
# changes to the binary log between backups.
# log_bin
# These are commonly set, remove the # and set as required.
basedir = /usr/local/mysql
datadir = /disk1/mysqldata
port = 3306
#server_id = 3
socket = /var/run/mysqld/mysqld.sock
user=mysql
log_error = /var/log/mysql/error.log
# Remove leading # to set options mainly useful for reporting servers.
# The server defaults are faster for transactions and fast SELECTs.
# Adjust sizes as needed, experiment to find the optimal values.
join_buffer_size = 256M
sort_buffer_size = 128M
read_rnd_buffer_size = 2M
#copied from old config
#key_buffer = 16M
max_allowed_packet = 256M
thread_stack = 192K
thread_cache_size = 8
query_cache_limit = 1M
#disabling query_cache_size and type, for replication purpose, need to enable it when going live
query_cache_size = 0
#query_cache_size = 64M
#query_cache_type = 1
query_cache_type = OFF
#GroupBy
sql_mode=STRICT_TRANS_TABLES,NO_ZERO_IN_DATE,NO_ZERO_DATE,ERROR_FOR_DIVISION_BY_ZERO,NO_AUTO_CREATE_USER,NO_ENGINE_SUBSTITUTION
#sql_mode=NO_ENGINE_SUBSTITUTION,STRICT_TRANS_TABLES
enforce-gtid-consistency
gtid-mode = ON
log_slave_updates=0
slave_transaction_retries = 100
#replication related changes
server-id = 2
relay-log = /disk1/mysqllog/mysql-relay-bin.log
log_bin = /disk1/mysqllog/binlog/mysql-bin.log
binlog_do_db = brandmanagement
#replicate_wild_do_table=brandmanagement.%
replicate-wild-ignore-table=brandmanagement.t\_gnip\_data\_recent
replicate-wild-ignore-table=brandmanagement.t\_gnip\_data
replicate-wild-ignore-table=brandmanagement.t\_fb\_rt\_data
replicate-wild-ignore-table=brandmanagement.t\_keyword\_tweets
replicate-wild-ignore-table=brandmanagement.t\_gnip\_data\_old
replicate-wild-ignore-table=brandmanagement.t\_gnip\_data\_new
binlog_format=row
report-host=10.125.133.220
report-port=3306
#sync-master-info=1
read-only=1
net_read_timeout = 7200
net_write_timeout = 7200
innodb_flush_log_at_trx_commit = 2
sync_binlog=0
sync_relay_log_info=0
max_relay_log_size=268435456
Lots of possible solutions, but I'll go with the simplest one: have you got enough network bandwidth to send all the changes over the network? Also, you're using the "row" binlog format, which may be good in the case of random, unindexed updates; but if you're changing a lot of data through indexes only, then the "mixed" binlog format may be better.
I'm trying to parallelize a script that prints out how many documents, pictures, and videos there are in a directory, as well as some other information. I've put the serial script at the end of this message. Here's one example that shows how it outputs the information about the given directory:
7 documents use 110.4 kb ( 1.55 % of total size)
2 pictures use 6.8 Mb ( 98.07 % of total size)
0 videos use 0.0 bytes ( 0.00 % of total size)
9 others use 26.8 kb ( 0.38 % of total size)
Now, I would like to use threads to minimize the execution time. I've tried this:
import threading
import tools
import time
import os
import os.path

directory_path = "Users/usersos/Desktop/j"

cv = threading.Lock()

type_ = ["documents", "pictures", "videos"]
e = {}
e["documents"] = [".pdf", ".html", ".rtf", ".txt"]
e["pictures"] = [".png", ".jpg", ".jpeg"]
e["videos"] = [".mpg", ".avi", ".mp4", ".mov"]

class type_thread(threading.Thread):
    def __init__(self, n, e_):
        super().__init__()
        self.extensions = e_
        self.name = n

    def __run__(self):
        files = tools.g(directory_path, self.extensions)
        n = len(files)
        s = tools.size1(files)
        p = s * 100 / tools.size2(directory_path)
        cv.acquire()
        print("{} {} use {} ({:10.2f} % of total size)".format(n, self.name, tools.compact(s), p))
        cv.release()

types = [type_thread(t, e[t]) for t in type_]

for t in types:
    t.start()
for t in types:
    t.join()
When I run that, nothing is printed out! And when I type 't' followed by the return key in the interpreter, I get <type_thread(videos, stopped 4367323136)>. What's more, sometimes the interpreter returns the right statistics with these same keys.
Why is that?
Initial script (serial):
import tools
import time
import os
import os.path

type_ = ["documents", "pictures", "videos"]
all_ = type_ + ["others"]

e = {}
e["documents"] = [".pdf", ".html", ".rtf", ".txt"]
e["pictures"] = [".png", ".jpg", ".jpeg"]
e["videos"] = [".mpg", ".avi", ".mp4", ".mov"]

def statistic(directory_path):
    # ----------------------------- Computing ---------------------------------
    d = {t: tools.g(directory_path, e[t]) for t in type_}
    d["others"] = [os.path.join(root, f) for root, _, files_names in os.walk(directory_path) for f in files_names if os.path.splitext(f)[1].lower() not in e["documents"] + e["pictures"] + e["videos"]]
    n = {t: len(d[t]) for t in type_}
    n["others"] = len(d["others"])
    s = {t: tools.size1(d[t]) for t in type_}
    s["others"] = tools.size1(d["others"])
    s_dir = tools.size2(directory_path)
    p = {t: s[t] * 100 / s_dir for t in type_}
    p["others"] = s["others"] * 100 / s_dir

    # ----------------------------- Printing ---------------------------------
    for t in all_:
        print("{} {} use {} ({:10.2f} % of total size)".format(n[t], t, tools.compact(s[t]), p[t]))
    return s_dir
Method start() seems not to do anything here. When I replace

for t in types:
    t.start()
for t in types:
    t.join()

with

for t in types:
    t.__run__()

it works fine (at least for now; I don't know if it still will once I add other commands). The reason is that Thread.start() executes a method named run() in the new thread; since the class defines __run__() instead of run(), start() falls back to the default Thread.run(), which does nothing because no target was passed. Calling t.__run__() directly does run the code, but sequentially in the main thread, without any parallelism.
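For completeness, here is a minimal sketch of what I believe the intended threaded version looks like, reusing tools, directory_path, cv, e, and type_ from the question; renaming __run__ to run lets start() execute the method in its own thread:

import threading

class type_thread(threading.Thread):
    def __init__(self, n, e_):
        super().__init__()
        self.extensions = e_
        self.name = n

    def run(self):  # Thread.start() calls a method named `run`
        files = tools.g(directory_path, self.extensions)
        n = len(files)
        s = tools.size1(files)
        p = s * 100 / tools.size2(directory_path)
        with cv:  # the lock keeps the printed lines from interleaving
            print("{} {} use {} ({:10.2f} % of total size)".format(n, self.name, tools.compact(s), p))

types = [type_thread(t, e[t]) for t in type_]
for t in types:
    t.start()
for t in types:
    t.join()

Whether this actually reduces the runtime depends on how much of the work is file-system I/O; because of the GIL, the pure-Python parts still run one at a time.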